浏览代码

Add starpu_get_next_bindid and starpu_bind_thread_on

to allow binding an application-started thread on a free core. Use it in
StarPU-MPI to automatically bind the MPI thread on an available core.
Samuel Thibault 6 年之前
父节点
当前提交
886cc6b57f

+ 3 - 0
ChangeLog

@@ -55,6 +55,9 @@ New features:
   * Add STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_CPU, and
     STARPU_SPECIFIC_NODE_SLOW as generic values for codelet specific memory
     nodes which can be used instead of exact node numbers.
+  * Add starpu_get_next_bindid and starpu_bind_thread_on to allow binding an
+    application-started thread on a free core. Use it in StarPU-MPI to
+    automatically bind the MPI thread on an available core.
 
 Small features:
   * Scheduling contexts may now be associated a user data pointer at creation

+ 27 - 0
doc/doxygen/chapters/api/initialization.doxy

@@ -291,6 +291,33 @@ This is StarPU termination method. It must be called at the end of the
 application: statistics and other post-mortem debugging information
 are not guaranteed to be available until this method has been called.
 
+\def STARPU_THREAD_ACTIVE
+\ingroup API_Initialization_and_Termination
+This flag should be passed to starpu_get_next_bindid() and
+starpu_bind_thread_on() when binding a thread which will significantly eat CPU
+time, and should thus have its own dedicated CPU.
+
+\fn unsigned starpu_get_next_bindid(unsigned flags, unsigned *preferred, unsigned npreferred)
+\ingroup API_Initialization_and_Termination
+This returns a PU binding ID which can be used to bind threads with
+starpu_bind_thread_on(). \p flags can be set to STARPU_THREAD_ACTIVE or 0.
+When \p npreferred is set to non-zero, \p preferred is an array of size \p
+npreferred in which a preference of PU binding IDs can be set. By default StarPU
+will return the first PU available for binding.
+
+\fn int starpu_bind_thread_on(int cpuid, unsigned flags, const char *name)
+\ingroup API_Initialization_and_Termination
+This binds the calling thread on the given \p cpuid (which should have been
+obtained with starpu_get_next_bindid()).
+
+This returns -1 if a thread was already bound to this PU (but binding will still
+have been done, and a warning will have been printed), so the caller can tell
+the user how to avoid the issue.
+
+\p name should be set to a unique string so that different calls with the same
+name for the same cpuid do not produce a warning.
+
+
 \fn void starpu_pause(void)
 \ingroup API_Initialization_and_Termination
 Suspend the processing of new tasks by

+ 4 - 0
include/starpu.h

@@ -154,6 +154,10 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv);
 int starpu_is_initialized(void);
 void starpu_wait_initialized(void);
 
+#define STARPU_THREAD_ACTIVE (1 << 0)
+unsigned starpu_get_next_bindid(unsigned flags, unsigned *preferred, unsigned npreferred);
+int starpu_bind_thread_on(int cpuid, unsigned flags, const char *name);
+
 void starpu_pause(void);
 void starpu_resume(void);
 

+ 13 - 3
mpi/src/mpi/starpu_mpi_mpi.c

@@ -1118,12 +1118,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	starpu_pthread_setname("MPI");
 
 #ifndef STARPU_SIMGRID
-	if (_starpu_mpi_thread_cpuid >= 0)
-		_starpu_bind_thread_on_cpu(_starpu_mpi_thread_cpuid, STARPU_NOWORKERID);
+	if (_starpu_mpi_thread_cpuid < 0)
+	{
+		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
+	}
+
+	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
+	{
+#ifdef STARPU_DEVEL
+#warning we should make this automatic by adding a CPU reservation field in starpu_config
+#endif
+		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_NCPU to leave one core available for MPI\n");
+	}
 	_starpu_mpi_do_initialize(argc_argv);
 	if (_starpu_mpi_thread_cpuid >= 0)
 		/* In case MPI changed the binding */
-		_starpu_bind_thread_on_cpu(_starpu_mpi_thread_cpuid, STARPU_NOWORKERID);
+		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
 #endif
 
 	_starpu_mpi_env_init();

+ 13 - 5
mpi/src/nmad/starpu_mpi_nmad.c

@@ -477,15 +477,23 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 {
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 
-	starpu_pthread_setname("MPI");
-
 #ifndef STARPU_SIMGRID
-	if (_starpu_mpi_thread_cpuid >= 0)
-		_starpu_bind_thread_on_cpu(_starpu_mpi_thread_cpuid, STARPU_NOWORKERID);
+	if (_starpu_mpi_thread_cpuid < 0)
+	{
+		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
+	}
+
+	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
+	{
+#ifdef STARPU_DEVEL
+#warning we should make this automatic by adding a CPU reservation field in starpu_config
+#endif
+		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_NCPU to leave one core available for MPI\n");
+	}
 	_starpu_mpi_do_initialize(argc_argv);
 	if (_starpu_mpi_thread_cpuid >= 0)
 		/* In case MPI changed the binding */
-		_starpu_bind_thread_on_cpu(_starpu_mpi_thread_cpuid, STARPU_NOWORKERID);
+		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
 #endif
 
 	_starpu_mpi_env_init();

+ 3 - 3
src/core/perfmodel/perfmodel.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2016-2017                      Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010-2015,2017                           CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  *
@@ -89,10 +89,10 @@ void _starpu_set_calibrate_flag(unsigned val);
 unsigned _starpu_get_calibrate_flag(void);
 
 #if defined(STARPU_USE_CUDA)
-int *_starpu_get_cuda_affinity_vector(unsigned gpuid);
+unsigned *_starpu_get_cuda_affinity_vector(unsigned gpuid);
 #endif
 #if defined(STARPU_USE_OPENCL)
-int *_starpu_get_opencl_affinity_vector(unsigned gpuid);
+unsigned *_starpu_get_opencl_affinity_vector(unsigned gpuid);
 #endif
 
 void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read,

+ 22 - 22
src/core/perfmodel/perfmodel_bus.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2014,2016-2017                      Inria
- * Copyright (C) 2009-2017                                Université de Bordeaux
+ * Copyright (C) 2009-2018                                Université de Bordeaux
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2013                                     Corentin Salingue
  *
@@ -104,7 +104,7 @@ static uint64_t cuda_size[STARPU_MAXCUDADEVS];
 #endif
 #ifdef STARPU_USE_CUDA
 /* preference order of cores (logical indexes) */
-static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXNUMANODES];
+static unsigned cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXNUMANODES];
 
 #ifndef STARPU_SIMGRID
 #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
@@ -121,7 +121,7 @@ static uint64_t opencl_size[STARPU_MAXCUDADEVS];
 #endif
 #ifdef STARPU_USE_OPENCL
 /* preference order of cores (logical indexes) */
-static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXNUMANODES];
+static unsigned opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXNUMANODES];
 static struct dev_timing opencldev_timing_per_numa[STARPU_MAXOPENCLDEVS*STARPU_MAXNUMANODES];
 #endif
 
@@ -152,7 +152,7 @@ hwloc_topology_t _starpu_perfmodel_get_hwtopology()
 static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, int numa, int cpu, struct dev_timing *dev_timing_per_cpu)
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 	size_t size = SIZE;
 
 	const unsigned nnuma_nodes = _starpu_topology_get_nnumanodes(config);
@@ -163,13 +163,13 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	cudaSetDevice(dev);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 
 	/* hack to force the initialization */
 	cudaFree(0);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 
         /* Get the maximum size which can be allocated on the device */
 	struct cudaDeviceProp prop;
@@ -185,7 +185,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	STARPU_ASSERT(cures == cudaSuccess);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 
 	/* Allocate a buffer on the host */
 	unsigned char *h_buffer;
@@ -212,14 +212,14 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	STARPU_ASSERT(cures == cudaSuccess);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 
 	/* Fill them */
 	memset(h_buffer, 0, size);
 	cudaMemset(d_buffer, 0, size);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 
 	const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa;
 	unsigned iter;
@@ -410,7 +410,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_opencl(int dev,
 	int not_initialized;
 
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 
 	const unsigned nnuma_nodes = _starpu_topology_get_nnumanodes(config);
 
@@ -444,7 +444,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_opencl(int dev,
 	}
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 
 	/* Allocate a buffer on the device */
 	cl_mem d_buffer;
@@ -452,7 +452,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_opencl(int dev,
 	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 	/* Allocate a buffer on the host */
 	unsigned char *h_buffer;
 #if defined(STARPU_HAVE_HWLOC)
@@ -474,14 +474,14 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_opencl(int dev,
 	}
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 	/* Fill them */
 	memset(h_buffer, 0, size);
 	err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 	clFinish(queue);
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 
 	const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa;
 	unsigned iter;
@@ -898,7 +898,7 @@ static void load_bus_affinity_file_content(void)
 		unsigned numa;
 		for (numa = 0; numa < nnumas; numa++)
 		{
-			ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][numa]);
+			ret = fscanf(f, "%u\t", &cuda_affinity_matrix[gpu][numa]);
 			STARPU_ASSERT(ret == 1);
 		}
 
@@ -922,7 +922,7 @@ static void load_bus_affinity_file_content(void)
 		unsigned numa;
 		for (numa = 0; numa < nnumas; numa++)
 		{
-			ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][numa]);
+			ret = fscanf(f, "%u\t", &opencl_affinity_matrix[gpu][numa]);
 			STARPU_ASSERT(ret == 1);
 		}
 
@@ -1092,14 +1092,14 @@ static void load_bus_affinity_file(void)
 }
 
 #ifdef STARPU_USE_CUDA
-int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
+unsigned *_starpu_get_cuda_affinity_vector(unsigned gpuid)
 {
 	return cuda_affinity_matrix[gpuid];
 }
 #endif /* STARPU_USE_CUDA */
 
 #ifdef STARPU_USE_OPENCL
-int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
+unsigned *_starpu_get_opencl_affinity_vector(unsigned gpuid)
 {
 	return opencl_affinity_matrix[gpuid];
 }
@@ -1121,7 +1121,7 @@ void starpu_bus_print_affinity(FILE *f)
 		fprintf(f, "%u\t", gpu);
 		for (numa = 0; numa < nnumas; numa++)
 		{
-			fprintf(f, "%d\t", cuda_affinity_matrix[gpu][numa]);
+			fprintf(f, "%u\t", cuda_affinity_matrix[gpu][numa]);
 		}
 		fprintf(f, "\n");
 	}
@@ -1133,7 +1133,7 @@ void starpu_bus_print_affinity(FILE *f)
 		fprintf(f, "%u\t", gpu);
 		for (numa = 0; numa < nnumas; numa++)
 		{
-			fprintf(f, "%d\t", opencl_affinity_matrix[gpu][numa]);
+			fprintf(f, "%u\t", opencl_affinity_matrix[gpu][numa]);
 		}
 		fprintf(f, "\n");
 	}
@@ -1894,7 +1894,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 				if (timing->timing_htod)
 					fprintf(f, "%2d %.0f %.0f\t", timing->numa_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
 				else
-					fprintf(f, "%2d\t", cuda_affinity_matrix[src][numa]);
+					fprintf(f, "%2u\t", cuda_affinity_matrix[src][numa]);
 			}
 		}
 #ifdef STARPU_USE_OPENCL
@@ -1910,7 +1910,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 				if (timing->timing_htod)
 					fprintf(f, "%2d %.0f %.0f\t", timing->numa_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
 				else
-					fprintf(f, "%2d\t", opencl_affinity_matrix[src][numa]);
+					fprintf(f, "%2u\t", opencl_affinity_matrix[src][numa]);
 			}
 		}
 #endif

+ 2 - 2
src/core/sched_ctx.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2018                                Inria
  * Copyright (C) 2012-2018                                CNRS
- * Copyright (C) 2012-2017                                Université de Bordeaux
+ * Copyright (C) 2012-2018                                Université de Bordeaux
  * Copyright (C) 2016                                     Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -2185,7 +2185,7 @@ unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker)
 
 void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid)
 {
-	_starpu_bind_thread_on_cpu(cpuid, STARPU_NOWORKERID);
+	_starpu_bind_thread_on_cpu(cpuid, STARPU_NOWORKERID, NULL);
 }
 
 unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id)

+ 120 - 37
src/core/topology.c

@@ -72,6 +72,7 @@ static int numa_enabled = -1;
 
 /* For checking whether two workers share the same PU, indexed by PU number */
 static int cpu_worker[STARPU_MAXCPUS];
+static char * cpu_name[STARPU_MAXCPUS];
 static unsigned nb_numa_nodes = 0;
 static int numa_memory_nodes_to_hwloclogid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in hwloc logid */
 static int numa_memory_nodes_to_physicalid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in physical id */
@@ -1023,20 +1024,38 @@ _starpu_initialize_workers_bindid (struct _starpu_machine_config *config)
 
 	/* no binding yet */
 	memset(&config->currently_bound, 0, sizeof(config->currently_bound));
+	memset(&config->currently_shared, 0, sizeof(config->currently_shared));
+}
+
+static void
+_starpu_deinitialize_workers_bindid (struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
+{
+	unsigned i;
+
+	for (i = 0; i < STARPU_MAXCPUS;i++)
+	{
+		if (cpu_name[i])
+		{
+			free(cpu_name[i]);
+			cpu_name[i] = NULL;
+		}
+	}
+
 }
 
 /* This function gets the identifier of the next core on which to bind a
  * worker. In case a list of preferred cores was specified (logical indexes),
  * we look for an available core among the list if possible, otherwise a
  * round-robin policy is used. */
-static inline int
-_starpu_get_next_bindid (struct _starpu_machine_config *config,
-			 int *preferred_binding, int npreferred)
+static inline unsigned
+_starpu_get_next_bindid (struct _starpu_machine_config *config, unsigned flags,
+			 unsigned *preferred_binding, unsigned npreferred)
 {
 	struct _starpu_machine_topology *topology = &config->topology;
 
-	int current_preferred;
-	int nhyperthreads = topology->nhwpus / topology->nhwcpus;
+	unsigned current_preferred;
+	unsigned nhyperthreads = topology->nhwpus / topology->nhwcpus;
+	unsigned ncores = topology->nhwpus / nhyperthreads;
 	unsigned i;
 
 	if (npreferred)
@@ -1049,32 +1068,42 @@ _starpu_get_next_bindid (struct _starpu_machine_config *config,
 	     current_preferred < npreferred;
 	     current_preferred++)
 	{
-		/* Try to get this core */
+		/* can we bind the worker on the preferred core ? */
 		unsigned requested_core = preferred_binding[current_preferred];
 		unsigned requested_bindid = requested_core * nhyperthreads;
 
-		/* can we bind the worker on the preferred core ? */
-		unsigned ind;
 		/* Look at the remaining cores to be bound to */
-		for (ind = 0;
-		     ind < topology->nhwpus / nhyperthreads;
-		     ind++)
+		for (i = 0; i < ncores; i++)
 		{
-			if (topology->workers_bindid[ind] == requested_bindid && !config->currently_bound[ind])
+			if (topology->workers_bindid[i] == requested_bindid &&
+					(!config->currently_bound[i] ||
+					 (config->currently_shared[i] && !(flags & STARPU_THREAD_ACTIVE)))
+					)
 			{
-				/* the cpu is available, we use it ! */
-				config->currently_bound[ind] = 1;
+				/* the cpu is available, or shareable with us, we use it ! */
+				config->currently_bound[i] = 1;
+				if (!(flags & STARPU_THREAD_ACTIVE))
+					config->currently_shared[i] = 1;
 				return requested_bindid;
 			}
 		}
 	}
 
-	for (i = config->current_bindid; i < topology->nhwpus / nhyperthreads; i++)
+	if (!(flags & STARPU_THREAD_ACTIVE))
+	{
+		/* Try to find a shareable PU */
+		for (i = 0; i < ncores; i++)
+			if (config->currently_shared[i])
+				return topology->workers_bindid[i];
+	}
+
+	/* Try to find an available PU from last used PU */
+	for (i = config->current_bindid; i < ncores; i++)
 		if (!config->currently_bound[i])
 			/* Found a cpu ready for use, use it! */
 			break;
 
-	if (i == topology->nhwpus / nhyperthreads)
+	if (i == ncores)
 	{
 		/* Finished binding on all cpus, restart from start in
 		 * case the user really wants overloading */
@@ -1082,13 +1111,20 @@ _starpu_get_next_bindid (struct _starpu_machine_config *config,
 		i = 0;
 	}
 
-	STARPU_ASSERT(i < topology->nhwpus / nhyperthreads);
-	int bindid = topology->workers_bindid[i];
+	STARPU_ASSERT(i < ncores);
+	unsigned bindid = topology->workers_bindid[i];
 	config->currently_bound[i] = 1;
+	if (!(flags & STARPU_THREAD_ACTIVE))
+		config->currently_shared[i] = 1;
 	config->current_bindid = i;
 	return bindid;
 }
 
+unsigned starpu_get_next_bindid(unsigned flags, unsigned *preferred, unsigned npreferred)
+{
+	return _starpu_get_next_bindid(_starpu_get_machine_config(), flags, preferred, npreferred);
+}
+
 unsigned
 _starpu_topology_get_nhwcpu (struct _starpu_machine_config *config)
 {
@@ -1930,17 +1966,18 @@ void _starpu_destroy_machine_config(struct _starpu_machine_config *config)
 #endif
 }
 
-void
+int
 _starpu_bind_thread_on_cpu (
-	int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED)
+		int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED, const char *name)
 {
+	int ret = 0;
 #ifdef STARPU_SIMGRID
-	return;
+	return ret;
 #else
 	if (nobind > 0)
-		return;
+		return ret;
 	if (cpuid < 0)
-		return;
+		return ret;
 
 #ifdef STARPU_HAVE_HWLOC
 	const struct hwloc_topology_support *support;
@@ -1956,11 +1993,41 @@ _starpu_bind_thread_on_cpu (
 
 	if (workerid != STARPU_NOWORKERID && cpuid < STARPU_MAXCPUS)
 	{
+/* TODO: mutex... */
 		int previous = cpu_worker[cpuid];
-		if (previous != STARPU_NOWORKERID && previous != workerid)
-			_STARPU_DISP("Warning: both workers %d and %d are bound to the same PU %d, this will strongly degrade performance. Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", previous, workerid, cpuid, config->topology.nhwcpus, config->topology.nhwpus);
+		/* We would like the PU to be available, or we are perhaps fine to share it */
+		if ( !(  previous == STARPU_NOWORKERID ||
+			(previous == STARPU_NONACTIVETHREAD && workerid == STARPU_NONACTIVETHREAD) ||
+			(previous >= 0 && previous == workerid) ||
+			(name && cpu_name[cpuid] && !strcmp(name, cpu_name[cpuid])) ) )
+		{
+			if (previous == STARPU_ACTIVETHREAD)
+				_STARPU_DISP("Warning: active thread %s was already bound to PU %d\n", cpu_name[cpuid], cpuid);
+			else if (previous == STARPU_NONACTIVETHREAD)
+				_STARPU_DISP("Warning: non-active thread %s was already bound to PU %d\n", cpu_name[cpuid], cpuid);
+			else
+				_STARPU_DISP("Warning: worker %d was already bound to PU %d\n", previous, cpuid);
+
+			if (workerid == STARPU_ACTIVETHREAD)
+				_STARPU_DISP("and we were told to also bind active thread %s to it.\n", name);
+			else if (workerid == STARPU_NONACTIVETHREAD) /* describe the thread being bound now, not the previous owner of the PU */
+				_STARPU_DISP("and we were told to also bind non-active thread %s to it.\n", name);
+			else
+				_STARPU_DISP("and we were told to also bind worker %d to it.\n", workerid);
+
+			_STARPU_DISP("This will strongly degrade performance.\n");
+
+			if (workerid >= 0)
+				/* This shouldn't happen for workers */
+				_STARPU_DISP("Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", config->topology.nhwcpus, config->topology.nhwpus);
+			ret = -1;
+		}
 		else
+		{
 			cpu_worker[cpuid] = workerid;
+			if (name)
+				cpu_name[cpuid] = strdup(name);
+		}
 	}
 
 	support = hwloc_topology_get_support (config->topology.hwtopology);
@@ -1970,12 +2037,12 @@ _starpu_bind_thread_on_cpu (
 			hwloc_get_obj_by_depth (config->topology.hwtopology,
 						config->pu_depth, cpuid);
 		hwloc_bitmap_t set = obj->cpuset;
-		int ret;
+		int res;
 
 		hwloc_bitmap_singlify(set);
-		ret = hwloc_set_cpubind (config->topology.hwtopology, set,
+		res = hwloc_set_cpubind (config->topology.hwtopology, set,
 					 HWLOC_CPUBIND_THREAD);
-		if (ret)
+		if (res)
 		{
 			perror("hwloc_set_cpubind");
 			STARPU_ABORT();
@@ -2009,6 +2076,20 @@ _starpu_bind_thread_on_cpu (
 #warning no CPU binding support
 #endif
 #endif
+	return ret;
+}
+
+int
+starpu_bind_thread_on(int cpuid, unsigned flags, const char *name)
+{
+	int workerid;
+	STARPU_ASSERT_MSG(name, "starpu_bind_thread_on must be provided with a name");
+	starpu_pthread_setname(name);
+	if (flags & STARPU_THREAD_ACTIVE)
+		workerid = STARPU_ACTIVETHREAD;
+	else
+		workerid = STARPU_NONACTIVETHREAD;
+	return _starpu_bind_thread_on_cpu(cpuid, workerid, name);
 }
 
 void
@@ -2065,7 +2146,7 @@ static void _starpu_init_binding_cpu(struct _starpu_machine_config *config)
 			case STARPU_CPU_WORKER:
 			{
 				/* Dedicate a cpu core to that worker */
-				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
+				workerarg->bindid = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, NULL, 0);
 				break;
 			}
 			default:
@@ -2485,8 +2566,8 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 		/* Perhaps the worker has some "favourite" bindings  */
-		int *preferred_binding = NULL;
-		int npreferred = 0;
+		unsigned *preferred_binding = NULL;
+		unsigned npreferred = 0;
 #endif
 
 		/* select the memory node that contains worker's memory */
@@ -2530,7 +2611,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 					if (config->topology.cuda_th_per_stream == 0)
 						workerarg->bindid = cuda_bindid[devid];
 					else
-						workerarg->bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+						workerarg->bindid = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
 				}
 				else
 				{
@@ -2538,11 +2619,11 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 					if (config->topology.cuda_th_per_dev == 0 && config->topology.cuda_th_per_stream == 0)
 					{
 						if (cuda_globalbindid == -1)
-							cuda_globalbindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+							cuda_globalbindid = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
 						workerarg->bindid = cuda_bindid[devid] = cuda_globalbindid;
 					}
 					else
-						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
 
 					for (numa = 0; numa < nb_numa_nodes; numa++)
@@ -2639,7 +2720,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 				else
 				{
 					opencl_init[devid] = 1;
-					workerarg->bindid = opencl_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+					workerarg->bindid = opencl_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
 					memory_node = opencl_memory_nodes[devid] = _starpu_memory_node_register(STARPU_OPENCL_RAM, devid);
 
 					for (numa = 0; numa < nb_numa_nodes; numa++)
@@ -2684,7 +2765,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 						//	preferred_binding = _starpu_get_mic_affinity_vector(devid);
 					//	npreferred = config->topology.nhwpus;
 					//}
-					mic_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+					mic_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
 					memory_node = mic_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MIC_RAM, devid);
 
 					for (numa = 0; numa < nb_numa_nodes; numa++)
@@ -2737,7 +2818,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 				else
 				{
 					mpi_init[devid] = 1;
-					mpi_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+					mpi_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
 					memory_node = mpi_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MPI_MS_RAM, devid);
 
 					for (numa = 0; numa < nb_numa_nodes; numa++)
@@ -2951,6 +3032,8 @@ void _starpu_destroy_topology(struct _starpu_machine_config *config STARPU_ATTRI
 	_starpu_memory_nodes_deinit();
 
 	_starpu_destroy_machine_config(config);
+
+	_starpu_deinitialize_workers_bindid(config);
 }
 
 void

+ 3 - 1
src/core/topology.h

@@ -63,10 +63,12 @@ void _starpu_topology_filter(hwloc_topology_t topology);
 #endif
 
 #define STARPU_NOWORKERID -1
+#define STARPU_ACTIVETHREAD -2 /* owner id used for a thread bound with STARPU_THREAD_ACTIVE */
+#define STARPU_NONACTIVETHREAD -3 /* owner id used for a thread bound without STARPU_THREAD_ACTIVE; must differ from STARPU_ACTIVETHREAD so PU sharing checks can tell them apart */
 /* Bind the current thread on the CPU logically identified by "cpuid". The
  * logical ordering of the processors is either that of hwloc (if available),
  * or the ordering exposed by the OS. */
-void _starpu_bind_thread_on_cpu(int cpuid, int workerid);
+int _starpu_bind_thread_on_cpu(int cpuid, int workerid, const char *name);
 
 struct _starpu_combined_worker;
 /* Bind the current thread on the set of CPUs for the given combined worker. */

+ 2 - 2
src/core/workers.c

@@ -657,7 +657,7 @@ void _starpu_driver_start(struct _starpu_worker *worker, unsigned fut_key, unsig
 	STARPU_PTHREAD_COND_SIGNAL(&worker->started_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
 
-	_starpu_bind_thread_on_cpu(worker->bindid, worker->workerid);
+	_starpu_bind_thread_on_cpu(worker->bindid, worker->workerid, NULL);
 
 #if defined(STARPU_PERF_DEBUG) && !defined(STARPU_SIMGRID)
 	setitimer(ITIMER_PROF, &prof_itimer, NULL);
@@ -1471,7 +1471,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
 	int main_thread_cpuid = starpu_get_env_number_default("STARPU_MAIN_THREAD_CPUID", -1);
 	if (main_thread_cpuid >= 0)
-		_starpu_bind_thread_on_cpu(main_thread_cpuid, STARPU_NOWORKERID);
+		_starpu_bind_thread_on_cpu(main_thread_cpuid, STARPU_NONACTIVETHREAD, "main");
 
 	_STARPU_DEBUG("Initialisation finished\n");
 

+ 1 - 0
src/core/workers.h

@@ -366,6 +366,7 @@ struct _starpu_machine_config
 	/* Where to bind next worker ? */
 	int current_bindid;
 	char currently_bound[STARPU_NMAXWORKERS];
+	char currently_shared[STARPU_NMAXWORKERS];
 
 	/* Which GPU(s) do we use for CUDA ? */
 	int current_cuda_gpuid;

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -105,7 +105,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 		}
 		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
 			/* rebind to single CPU */
-			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid);
+			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid, NULL);
 	}
 	else
 	{

+ 1 - 1
src/drivers/cuda/driver_cuda.c

@@ -701,7 +701,7 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 	}
 
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(worker0->bindid, worker0->workerid);
+	_starpu_bind_thread_on_cpu(worker0->bindid, worker0->workerid, NULL);
 
 	for (i = 0; i < worker_set->nworkers; i++)
 	{

+ 3 - 3
src/drivers/gordon/driver_gordon.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2012                                Inria
- * Copyright (C) 2008-2015,2017                           Université de Bordeaux
+ * Copyright (C) 2008-2015,2017-2018                           Université de Bordeaux
  * Copyright (C) 2010-2011,2013,2015-2017                 CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  *
@@ -56,7 +56,7 @@ void *gordon_worker_progress(void *arg)
 	struct _starpu_worker_set *gordon_set_arg = arg;
 	unsigned prog_thread_bind_id =
 		(gordon_set_arg->workers[0].bindid + 1)%(gordon_set_arg->config->nhwcores);
-	_starpu_bind_thread_on_cpu(prog_thread_bind_id, gordon_set_arg->workers[0].workerid);
+	_starpu_bind_thread_on_cpu(prog_thread_bind_id, gordon_set_arg->workers[0].workerid, NULL);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	progress_thread_is_inited = 1;
@@ -441,7 +441,7 @@ void *_starpu_gordon_worker(void *arg)
 {
 	struct _starpu_worker_set *gordon_set_arg = arg;
 
-	_starpu_bind_thread_on_cpu(gordon_set_arg->config, gordon_set_arg->workers[0].bindid, gordon_set_arg->workers[0].workerid);
+	_starpu_bind_thread_on_cpu(gordon_set_arg->config, gordon_set_arg->workers[0].bindid, gordon_set_arg->workers[0].workerid, NULL);
 
 	/* TODO set_local_memory_node per SPU */
 	gordon_init(gordon_set_arg->nworkers);

+ 1 - 1
src/drivers/opencl/driver_opencl.c

@@ -629,7 +629,7 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 	_starpu_opencl_init_context(devid);
 
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(worker->bindid, worker->workerid);
+	_starpu_bind_thread_on_cpu(worker->bindid, worker->workerid, NULL);
 
 	_starpu_opencl_limit_gpu_mem_if_needed(devid);
 	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));