6 年之前 · 886cc6b57f
--- a/ChangeLog
+++ b/ChangeLog
@@ -55,6 +55,9 @@ New features:
 
				   * Add STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_CPU, and
			
 
				     STARPU_SPECIFIC_NODE_SLOW as generic values for codelet specific memory
			
 
				     nodes which can be used instead of exact node numbers.
			
 
				+  * Add starpu_get_next_bindid and starpu_bind_thread_on to allow binding an
			
 
				+    application-started thread on a free core. Use it in StarPU-MPI to
			
 
				+    automatically bind the MPI thread on an available core.
			
 
				 
			
 
				 Small features:
			
 
				   * Scheduling contexts may now be associated a user data pointer at creation
			
--- a/doc/doxygen/chapters/api/initialization.doxy
+++ b/doc/doxygen/chapters/api/initialization.doxy
@@ -291,6 +291,33 @@ This is StarPU termination method. It must be called at the end of the
 
				 application: statistics and other post-mortem debugging information
			
 
				 are not guaranteed to be available until this method has been called.
			
 
				 
			
 
				+\def STARPU_THREAD_ACTIVE
			
 
				+\ingroup API_Initialization_and_Termination
			
 
				+This flag should be passed to starpu_get_next_bindid() and
			
 
				+starpu_bind_thread_on() when binding a thread which will significantly eat CPU
			
 
				+time, and should thus have its own dedicated CPU.
			
 
				+
			
 
				+\fn unsigned starpu_get_next_bindid(unsigned flags, unsigned *preferred, unsigned npreferred)
			
 
				+\ingroup API_Initialization_and_Termination
			
 
				+This returns a PU binding ID which can be used to bind threads with
			
 
				+starpu_bind_thread_on(). \p flags can be set to STARPU_THREAD_ACTIVE or 0.
			
 
				+When \p npreferred is set to non-zero, \p preferred is an array of size \p
			
 
				+npreferred in which a preference of PU binding IDs can be set. By default StarPU
			
 
				+will return the first PU available for binding.
			
 
				+
			
 
				+\fn int starpu_bind_thread_on(int cpuid, unsigned flags, const char *name)
			
 
				+\ingroup API_Initialization_and_Termination
			
 
				+This binds the calling thread on the given \p cpuid (which should have been
			
 
				+obtained with starpu_get_next_bindid()).
			
 
				+
			
 
				+This returns -1 if a thread was already bound to this PU (but binding will still
			
 
				+have been done, and a warning will have been printed), so the caller can tell
			
 
				+the user how to avoid the issue.
			
 
				+
			
 
				+\p name should be set to a unique string so that different calls with the same
			
 
				+name for the same cpuid do not produce a warning.
			
 
				+
			
 
				+
			
 
				 \fn void starpu_pause(void)
			
 
				 \ingroup API_Initialization_and_Termination
			
 
				 Suspend the processing of new tasks by
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -154,6 +154,10 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv);
 
				 int starpu_is_initialized(void);
			
 
				 void starpu_wait_initialized(void);
			
 
				 
			
 
				+#define STARPU_THREAD_ACTIVE (1 << 0)
			
 
				+unsigned starpu_get_next_bindid(unsigned flags, unsigned *preferred, unsigned npreferred);
			
 
				+int starpu_bind_thread_on(int cpuid, unsigned flags, const char *name);
			
 
				+
			
 
				 void starpu_pause(void);
			
 
				 void starpu_resume(void);
			
 
				 
			
--- a/mpi/src/mpi/starpu_mpi_mpi.c
+++ b/mpi/src/mpi/starpu_mpi_mpi.c
@@ -1118,12 +1118,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 	starpu_pthread_setname("MPI");
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	if (_starpu_mpi_thread_cpuid >= 0)
			
 
				-		_starpu_bind_thread_on_cpu(_starpu_mpi_thread_cpuid, STARPU_NOWORKERID);
			
 
				+	if (_starpu_mpi_thread_cpuid < 0)
			
 
				+	{
			
 
				+		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
			
 
				+	}
			
 
				+
			
 
				+	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
			
 
				+	{
			
 
				+#ifdef STARPU_DEVEL
			
 
				+#warning we should make this automatic by adding a CPU reservation field in starpu_config
			
 
				+#endif
			
 
				+		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_NCPU to leave one core available for MPI\n");
			
 
				+	}
			
 
				 	_starpu_mpi_do_initialize(argc_argv);
			
 
				 	if (_starpu_mpi_thread_cpuid >= 0)
			
 
				 		/* In case MPI changed the binding */
			
 
				-		_starpu_bind_thread_on_cpu(_starpu_mpi_thread_cpuid, STARPU_NOWORKERID);
			
 
				+		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
			
 
				 #endif
			
 
				 
			
 
				 	_starpu_mpi_env_init();
			
--- a/mpi/src/nmad/starpu_mpi_nmad.c
+++ b/mpi/src/nmad/starpu_mpi_nmad.c
@@ -477,15 +477,23 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 {
			
 
				 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
			
 
				 
			
 
				-	starpu_pthread_setname("MPI");
			
 
				-
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	if (_starpu_mpi_thread_cpuid >= 0)
			
 
				-		_starpu_bind_thread_on_cpu(_starpu_mpi_thread_cpuid, STARPU_NOWORKERID);
			
 
				+	if (_starpu_mpi_thread_cpuid < 0)
			
 
				+	{
			
 
				+		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
			
 
				+	}
			
 
				+
			
 
				+	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
			
 
				+	{
			
 
				+#ifdef STARPU_DEVEL
			
 
				+#warning we should make this automatic by adding a CPU reservation field in starpu_config
			
 
				+#endif
			
 
				+		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_NCPU to leave one core available for MPI\n");
			
 
				+	}
			
 
				 	_starpu_mpi_do_initialize(argc_argv);
			
 
				 	if (_starpu_mpi_thread_cpuid >= 0)
			
 
				 		/* In case MPI changed the binding */
			
 
				-		_starpu_bind_thread_on_cpu(_starpu_mpi_thread_cpuid, STARPU_NOWORKERID);
			
 
				+		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
			
 
				 #endif
			
 
				 
			
 
				 	_starpu_mpi_env_init();
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011-2013,2016-2017                      Inria
			
 
				- * Copyright (C) 2008-2017                                Université de Bordeaux
			
 
				+ * Copyright (C) 2008-2018                                Université de Bordeaux
			
 
				  * Copyright (C) 2010-2015,2017                           CNRS
			
 
				  * Copyright (C) 2011                                     Télécom-SudParis
			
 
				  *
			
@@ -89,10 +89,10 @@ void _starpu_set_calibrate_flag(unsigned val);
 
				 unsigned _starpu_get_calibrate_flag(void);
			
 
				 
			
 
				 #if defined(STARPU_USE_CUDA)
			
 
				-int *_starpu_get_cuda_affinity_vector(unsigned gpuid);
			
 
				+unsigned *_starpu_get_cuda_affinity_vector(unsigned gpuid);
			
 
				 #endif
			
 
				 #if defined(STARPU_USE_OPENCL)
			
 
				-int *_starpu_get_opencl_affinity_vector(unsigned gpuid);
			
 
				+unsigned *_starpu_get_opencl_affinity_vector(unsigned gpuid);
			
 
				 #endif
			
 
				 
			
 
				 void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read,
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011-2014,2016-2017                      Inria
			
 
				- * Copyright (C) 2009-2017                                Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2018                                Université de Bordeaux
			
 
				  * Copyright (C) 2010-2017                                CNRS
			
 
				  * Copyright (C) 2013                                     Corentin Salingue
			
 
				  *
			
@@ -104,7 +104,7 @@ static uint64_t cuda_size[STARPU_MAXCUDADEVS];
 
				 #endif
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 /* preference order of cores (logical indexes) */
			
 
				-static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXNUMANODES];
			
 
				+static unsigned cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXNUMANODES];
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
			
@@ -121,7 +121,7 @@ static uint64_t opencl_size[STARPU_MAXCUDADEVS];
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 /* preference order of cores (logical indexes) */
			
 
				-static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXNUMANODES];
			
 
				+static unsigned opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXNUMANODES];
			
 
				 static struct dev_timing opencldev_timing_per_numa[STARPU_MAXOPENCLDEVS*STARPU_MAXNUMANODES];
			
 
				 #endif
			
 
				 
			
@@ -152,7 +152,7 @@ hwloc_topology_t _starpu_perfmodel_get_hwtopology()
 
				 static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, int numa, int cpu, struct dev_timing *dev_timing_per_cpu)
			
 
				 {
			
 
				 	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 	size_t size = SIZE;
			
 
				 
			
 
				 	const unsigned nnuma_nodes = _starpu_topology_get_nnumanodes(config);
			
@@ -163,13 +163,13 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 	cudaSetDevice(dev);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 
			
 
				 	/* hack to force the initialization */
			
 
				 	cudaFree(0);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 
			
 
				         /* Get the maximum size which can be allocated on the device */
			
 
				 	struct cudaDeviceProp prop;
			
@@ -185,7 +185,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 	STARPU_ASSERT(cures == cudaSuccess);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 
			
 
				 	/* Allocate a buffer on the host */
			
 
				 	unsigned char *h_buffer;
			
@@ -212,14 +212,14 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 	STARPU_ASSERT(cures == cudaSuccess);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 
			
 
				 	/* Fill them */
			
 
				 	memset(h_buffer, 0, size);
			
 
				 	cudaMemset(d_buffer, 0, size);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 
			
 
				 	const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa;
			
 
				 	unsigned iter;
			
@@ -410,7 +410,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_opencl(int dev,
 
				 	int not_initialized;
			
 
				 
			
 
				 	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 
			
 
				 	const unsigned nnuma_nodes = _starpu_topology_get_nnumanodes(config);
			
 
				 
			
@@ -444,7 +444,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_opencl(int dev,
 
				 	}
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 
			
 
				 	/* Allocate a buffer on the device */
			
 
				 	cl_mem d_buffer;
			
@@ -452,7 +452,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_opencl(int dev,
 
				 	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 	/* Allocate a buffer on the host */
			
 
				 	unsigned char *h_buffer;
			
 
				 #if defined(STARPU_HAVE_HWLOC)
			
@@ -474,14 +474,14 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_opencl(int dev,
 
				 	}
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 	/* Fill them */
			
 
				 	memset(h_buffer, 0, size);
			
 
				 	err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
			
 
				 	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 	clFinish(queue);
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				-	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
 
				 
			
 
				 	const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa;
			
 
				 	unsigned iter;
			
@@ -898,7 +898,7 @@ static void load_bus_affinity_file_content(void)
 
				 		unsigned numa;
			
 
				 		for (numa = 0; numa < nnumas; numa++)
			
 
				 		{
			
 
				-			ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][numa]);
			
 
				+			ret = fscanf(f, "%u\t", &cuda_affinity_matrix[gpu][numa]);
			
 
				 			STARPU_ASSERT(ret == 1);
			
 
				 		}
			
 
				 
			
@@ -922,7 +922,7 @@ static void load_bus_affinity_file_content(void)
 
				 		unsigned numa;
			
 
				 		for (numa = 0; numa < nnumas; numa++)
			
 
				 		{
			
 
				-			ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][numa]);
			
 
				+			ret = fscanf(f, "%u\t", &opencl_affinity_matrix[gpu][numa]);
			
 
				 			STARPU_ASSERT(ret == 1);
			
 
				 		}
			
 
				 
			
@@ -1092,14 +1092,14 @@ static void load_bus_affinity_file(void)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
			
 
				+unsigned *_starpu_get_cuda_affinity_vector(unsigned gpuid)
			
 
				 {
			
 
				 	return cuda_affinity_matrix[gpuid];
			
 
				 }
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
			
 
				+unsigned *_starpu_get_opencl_affinity_vector(unsigned gpuid)
			
 
				 {
			
 
				 	return opencl_affinity_matrix[gpuid];
			
 
				 }
			
@@ -1121,7 +1121,7 @@ void starpu_bus_print_affinity(FILE *f)
 
				 		fprintf(f, "%u\t", gpu);
			
 
				 		for (numa = 0; numa < nnumas; numa++)
			
 
				 		{
			
 
				-			fprintf(f, "%d\t", cuda_affinity_matrix[gpu][numa]);
			
 
				+			fprintf(f, "%u\t", cuda_affinity_matrix[gpu][numa]);
			
 
				 		}
			
 
				 		fprintf(f, "\n");
			
 
				 	}
			
@@ -1133,7 +1133,7 @@ void starpu_bus_print_affinity(FILE *f)
 
				 		fprintf(f, "%u\t", gpu);
			
 
				 		for (numa = 0; numa < nnumas; numa++)
			
 
				 		{
			
 
				-			fprintf(f, "%d\t", opencl_affinity_matrix[gpu][numa]);
			
 
				+			fprintf(f, "%u\t", opencl_affinity_matrix[gpu][numa]);
			
 
				 		}
			
 
				 		fprintf(f, "\n");
			
 
				 	}
			
@@ -1894,7 +1894,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				 				if (timing->timing_htod)
			
 
				 					fprintf(f, "%2d %.0f %.0f\t", timing->numa_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
			
 
				 				else
			
 
				-					fprintf(f, "%2d\t", cuda_affinity_matrix[src][numa]);
			
 
				+					fprintf(f, "%2u\t", cuda_affinity_matrix[src][numa]);
			
 
				 			}
			
 
				 		}
			
 
				 #ifdef STARPU_USE_OPENCL
			
@@ -1910,7 +1910,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				 				if (timing->timing_htod)
			
 
				 					fprintf(f, "%2d %.0f %.0f\t", timing->numa_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
			
 
				 				else
			
 
				-					fprintf(f, "%2d\t", opencl_affinity_matrix[src][numa]);
			
 
				+					fprintf(f, "%2u\t", opencl_affinity_matrix[src][numa]);
			
 
				 			}
			
 
				 		}
			
 
				 #endif
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2011-2018                                Inria
			
 
				  * Copyright (C) 2012-2018                                CNRS
			
 
				- * Copyright (C) 2012-2017                                Université de Bordeaux
			
 
				+ * Copyright (C) 2012-2018                                Université de Bordeaux
			
 
				  * Copyright (C) 2016                                     Uppsala University
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -2185,7 +2185,7 @@ unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker)
 
				 
			
 
				 void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid)
			
 
				 {
			
 
				-	_starpu_bind_thread_on_cpu(cpuid, STARPU_NOWORKERID);
			
 
				+	_starpu_bind_thread_on_cpu(cpuid, STARPU_NOWORKERID, NULL);
			
 
				 }
			
 
				 
			
 
				 unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id)
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -72,6 +72,7 @@ static int numa_enabled = -1;
 
				 
			
 
				 /* For checking whether two workers share the same PU, indexed by PU number */
			
 
				 static int cpu_worker[STARPU_MAXCPUS];
			
 
				+static char * cpu_name[STARPU_MAXCPUS];
			
 
				 static unsigned nb_numa_nodes = 0;
			
 
				 static int numa_memory_nodes_to_hwloclogid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in hwloc logid */
			
 
				 static int numa_memory_nodes_to_physicalid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in physical id */
			
@@ -1023,20 +1024,38 @@ _starpu_initialize_workers_bindid (struct _starpu_machine_config *config)
 
				 
			
 
				 	/* no binding yet */
			
 
				 	memset(&config->currently_bound, 0, sizeof(config->currently_bound));
			
 
				+	memset(&config->currently_shared, 0, sizeof(config->currently_shared));
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+_starpu_deinitialize_workers_bindid (struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+
			
 
				+	for (i = 0; i < STARPU_MAXCPUS;i++)
			
 
				+	{
			
 
				+		if (cpu_name[i])
			
 
				+		{
			
 
				+			free(cpu_name[i]);
			
 
				+			cpu_name[i] = NULL;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 }
			
 
				 
			
 
				 /* This function gets the identifier of the next core on which to bind a
			
 
				  * worker. In case a list of preferred cores was specified (logical indexes),
			
 
				  * we look for a an available core among the list if possible, otherwise a
			
 
				  * round-robin policy is used. */
			
 
				-static inline int
			
 
				-_starpu_get_next_bindid (struct _starpu_machine_config *config,
			
 
				-			 int *preferred_binding, int npreferred)
			
 
				+static inline unsigned
			
 
				+_starpu_get_next_bindid (struct _starpu_machine_config *config, unsigned flags,
			
 
				+			 unsigned *preferred_binding, unsigned npreferred)
			
 
				 {
			
 
				 	struct _starpu_machine_topology *topology = &config->topology;
			
 
				 
			
 
				-	int current_preferred;
			
 
				-	int nhyperthreads = topology->nhwpus / topology->nhwcpus;
			
 
				+	unsigned current_preferred;
			
 
				+	unsigned nhyperthreads = topology->nhwpus / topology->nhwcpus;
			
 
				+	unsigned ncores = topology->nhwpus / nhyperthreads;
			
 
				 	unsigned i;
			
 
				 
			
 
				 	if (npreferred)
			
@@ -1049,32 +1068,42 @@ _starpu_get_next_bindid (struct _starpu_machine_config *config,
 
				 	     current_preferred < npreferred;
			
 
				 	     current_preferred++)
			
 
				 	{
			
 
				-		/* Try to get this core */
			
 
				+		/* can we bind the worker on the preferred core ? */
			
 
				 		unsigned requested_core = preferred_binding[current_preferred];
			
 
				 		unsigned requested_bindid = requested_core * nhyperthreads;
			
 
				 
			
 
				-		/* can we bind the worker on the preferred core ? */
			
 
				-		unsigned ind;
			
 
				 		/* Look at the remaining cores to be bound to */
			
 
				-		for (ind = 0;
			
 
				-		     ind < topology->nhwpus / nhyperthreads;
			
 
				-		     ind++)
			
 
				+		for (i = 0; i < ncores; i++)
			
 
				 		{
			
 
				-			if (topology->workers_bindid[ind] == requested_bindid && !config->currently_bound[ind])
			
 
				+			if (topology->workers_bindid[i] == requested_bindid &&
			
 
				+					(!config->currently_bound[i] ||
			
 
				+					 (config->currently_shared[i] && !(flags & STARPU_THREAD_ACTIVE)))
			
 
				+					)
			
 
				 			{
			
 
				-				/* the cpu is available, we use it ! */
			
 
				-				config->currently_bound[ind] = 1;
			
 
				+				/* the cpu is available, or shareable with us, we use it ! */
			
 
				+				config->currently_bound[i] = 1;
			
 
				+				if (!(flags & STARPU_THREAD_ACTIVE))
			
 
				+					config->currently_shared[i] = 1;
			
 
				 				return requested_bindid;
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	for (i = config->current_bindid; i < topology->nhwpus / nhyperthreads; i++)
			
 
				+	if (!(flags & STARPU_THREAD_ACTIVE))
			
 
				+	{
			
 
				+		/* Try to find a shareable PU */
			
 
				+		for (i = 0; i < ncores; i++)
			
 
				+			if (config->currently_shared[i])
			
 
				+				return topology->workers_bindid[i];
			
 
				+	}
			
 
				+
			
 
				+	/* Try to find an available PU from last used PU */
			
 
				+	for (i = config->current_bindid; i < ncores; i++)
			
 
				 		if (!config->currently_bound[i])
			
 
				 			/* Found a cpu ready for use, use it! */
			
 
				 			break;
			
 
				 
			
 
				-	if (i == topology->nhwpus / nhyperthreads)
			
 
				+	if (i == ncores)
			
 
				 	{
			
 
				 		/* Finished binding on all cpus, restart from start in
			
 
				 		 * case the user really wants overloading */
			
@@ -1082,13 +1111,20 @@ _starpu_get_next_bindid (struct _starpu_machine_config *config,
 
				 		i = 0;
			
 
				 	}
			
 
				 
			
 
				-	STARPU_ASSERT(i < topology->nhwpus / nhyperthreads);
			
 
				-	int bindid = topology->workers_bindid[i];
			
 
				+	STARPU_ASSERT(i < ncores);
			
 
				+	unsigned bindid = topology->workers_bindid[i];
			
 
				 	config->currently_bound[i] = 1;
			
 
				+	if (!(flags & STARPU_THREAD_ACTIVE))
			
 
				+		config->currently_shared[i] = 1;
			
 
				 	config->current_bindid = i;
			
 
				 	return bindid;
			
 
				 }
			
 
				 
			
 
				+unsigned starpu_get_next_bindid(unsigned flags, unsigned *preferred, unsigned npreferred)
			
 
				+{
			
 
				+	return _starpu_get_next_bindid(_starpu_get_machine_config(), flags, preferred, npreferred);
			
 
				+}
			
 
				+
			
 
				 unsigned
			
 
				 _starpu_topology_get_nhwcpu (struct _starpu_machine_config *config)
			
 
				 {
			
@@ -1930,17 +1966,18 @@ void _starpu_destroy_machine_config(struct _starpu_machine_config *config)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-void
			
 
				+int
			
 
				 _starpu_bind_thread_on_cpu (
			
 
				-	int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED)
			
 
				+		int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED, const char *name)
			
 
				 {
			
 
				+	int ret = 0;
			
 
				 #ifdef STARPU_SIMGRID
			
 
				-	return;
			
 
				+	return ret;
			
 
				 #else
			
 
				 	if (nobind > 0)
			
 
				-		return;
			
 
				+		return ret;
			
 
				 	if (cpuid < 0)
			
 
				-		return;
			
 
				+		return ret;
			
 
				 
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 	const struct hwloc_topology_support *support;
			
@@ -1956,11 +1993,41 @@ _starpu_bind_thread_on_cpu (
 
				 
			
 
				 	if (workerid != STARPU_NOWORKERID && cpuid < STARPU_MAXCPUS)
			
 
				 	{
			
 
				+/* TODO: mutex... */
			
 
				 		int previous = cpu_worker[cpuid];
			
 
				-		if (previous != STARPU_NOWORKERID && previous != workerid)
			
 
				-			_STARPU_DISP("Warning: both workers %d and %d are bound to the same PU %d, this will strongly degrade performance. Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", previous, workerid, cpuid, config->topology.nhwcpus, config->topology.nhwpus);
			
 
				+		/* We would like the PU to be available, or we are perhaps fine to share it */
			
 
				+		if ( !(  previous == STARPU_NOWORKERID ||
			
 
				+			(previous == STARPU_NONACTIVETHREAD && workerid == STARPU_NONACTIVETHREAD) ||
			
 
				+			(previous >= 0 && previous == workerid) ||
			
 
				+			(name && cpu_name[cpuid] && !strcmp(name, cpu_name[cpuid])) ) )
			
 
				+		{
			
 
				+			if (previous == STARPU_ACTIVETHREAD)
			
 
				+				_STARPU_DISP("Warning: active thread %s was already bound to PU %d\n", cpu_name[cpuid], cpuid);
			
 
				+			else if (previous == STARPU_NONACTIVETHREAD)
			
 
				+				_STARPU_DISP("Warning: non-active thread %s was already bound to PU %d\n", cpu_name[cpuid], cpuid);
			
 
				+			else
			
 
				+				_STARPU_DISP("Warning: worker %d was already bound to PU %d\n", previous, cpuid);
			
 
				+
			
 
				+			if (workerid == STARPU_ACTIVETHREAD)
			
 
				+				_STARPU_DISP("and we were told to also bind active thread %s to it.\n", name);
			
 
				+			else if (workerid == STARPU_NONACTIVETHREAD) /* describes the thread being bound now, hence workerid and not previous */
			
 
				+				_STARPU_DISP("and we were told to also bind non-active thread %s to it.\n", name);
			
 
				+			else
			
 
				+				_STARPU_DISP("and we were told to also bind worker %d to it.\n", workerid);
			
 
				+
			
 
				+			_STARPU_DISP("This will strongly degrade performance.\n");
			
 
				+
			
 
				+			if (workerid >= 0)
			
 
				+				/* This shouldn't happen for workers */
			
 
				+				_STARPU_DISP("Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", config->topology.nhwcpus, config->topology.nhwpus);
			
 
				+			ret = -1;
			
 
				+		}
			
 
				 		else
			
 
				+		{
			
 
				 			cpu_worker[cpuid] = workerid;
			
 
				+			if (name && !cpu_name[cpuid]) /* record the name once: re-duplicating on each accepted rebind leaked the previous copy */
			
 
				+				cpu_name[cpuid] = strdup(name);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	support = hwloc_topology_get_support (config->topology.hwtopology);
			
@@ -1970,12 +2037,12 @@ _starpu_bind_thread_on_cpu (
 
				 			hwloc_get_obj_by_depth (config->topology.hwtopology,
			
 
				 						config->pu_depth, cpuid);
			
 
				 		hwloc_bitmap_t set = obj->cpuset;
			
 
				-		int ret;
			
 
				+		int res;
			
 
				 
			
 
				 		hwloc_bitmap_singlify(set);
			
 
				-		ret = hwloc_set_cpubind (config->topology.hwtopology, set,
			
 
				+		res = hwloc_set_cpubind (config->topology.hwtopology, set,
			
 
				 					 HWLOC_CPUBIND_THREAD);
			
 
				-		if (ret)
			
 
				+		if (res)
			
 
				 		{
			
 
				 			perror("hwloc_set_cpubind");
			
 
				 			STARPU_ABORT();
			
@@ -2009,6 +2076,20 @@ _starpu_bind_thread_on_cpu (
 
				 #warning no CPU binding support
			
 
				 #endif
			
 
				 #endif
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+starpu_bind_thread_on(int cpuid, unsigned flags, const char *name)
			
 
				+{
			
 
				+	int workerid;
			
 
				+	STARPU_ASSERT_MSG(name, "starpu_bind_thread_on must be provided with a name");
			
 
				+	starpu_pthread_setname(name);
			
 
				+	if (flags & STARPU_THREAD_ACTIVE)
			
 
				+		workerid = STARPU_ACTIVETHREAD;
			
 
				+	else
			
 
				+		workerid = STARPU_NONACTIVETHREAD;
			
 
				+	return _starpu_bind_thread_on_cpu(cpuid, workerid, name);
			
 
				 }
			
 
				 
			
 
				 void
			
@@ -2065,7 +2146,7 @@ static void _starpu_init_binding_cpu(struct _starpu_machine_config *config)
 
				 			case STARPU_CPU_WORKER:
			
 
				 			{
			
 
				 				/* Dedicate a cpu core to that worker */
			
 
				-				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
			
 
				+				workerarg->bindid = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, NULL, 0);
			
 
				 				break;
			
 
				 			}
			
 
				 			default:
			
@@ -2485,8 +2566,8 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 		/* Perhaps the worker has some "favourite" bindings  */
			
 
				-		int *preferred_binding = NULL;
			
 
				-		int npreferred = 0;
			
 
				+		unsigned *preferred_binding = NULL;
			
 
				+		unsigned npreferred = 0;
			
 
				 #endif
			
 
				 
			
 
				 		/* select the memory node that contains worker's memory */
			
@@ -2530,7 +2611,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 					if (config->topology.cuda_th_per_stream == 0)
			
 
				 						workerarg->bindid = cuda_bindid[devid];
			
 
				 					else
			
 
				-						workerarg->bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				+						workerarg->bindid = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
			
 
				 				}
			
 
				 				else
			
 
				 				{
			
@@ -2538,11 +2619,11 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 					if (config->topology.cuda_th_per_dev == 0 && config->topology.cuda_th_per_stream == 0)
			
 
				 					{
			
 
				 						if (cuda_globalbindid == -1)
			
 
				-							cuda_globalbindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				+							cuda_globalbindid = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
			
 
				 						workerarg->bindid = cuda_bindid[devid] = cuda_globalbindid;
			
 
				 					}
			
 
				 					else
			
 
				-						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				+						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
			
 
				 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
			
 
				 
			
 
				 					for (numa = 0; numa < nb_numa_nodes; numa++)
			
@@ -2639,7 +2720,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 				else
			
 
				 				{
			
 
				 					opencl_init[devid] = 1;
			
 
				-					workerarg->bindid = opencl_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				+					workerarg->bindid = opencl_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
			
 
				 					memory_node = opencl_memory_nodes[devid] = _starpu_memory_node_register(STARPU_OPENCL_RAM, devid);
			
 
				 
			
 
				 					for (numa = 0; numa < nb_numa_nodes; numa++)
			
@@ -2684,7 +2765,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 						//	preferred_binding = _starpu_get_mic_affinity_vector(devid);
			
 
				 					//	npreferred = config->topology.nhwpus;
			
 
				 					//}
			
 
				-					mic_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				+					mic_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
			
 
				 					memory_node = mic_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MIC_RAM, devid);
			
 
				 
			
 
				 					for (numa = 0; numa < nb_numa_nodes; numa++)
			
@@ -2737,7 +2818,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 				else
			
 
				 				{
			
 
				 					mpi_init[devid] = 1;
			
 
				-					mpi_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				+					mpi_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
			
 
				 					memory_node = mpi_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MPI_MS_RAM, devid);
			
 
				 
			
 
				 					for (numa = 0; numa < nb_numa_nodes; numa++)
			
@@ -2951,6 +3032,8 @@ void _starpu_destroy_topology(struct _starpu_machine_config *config STARPU_ATTRI
 
				 	_starpu_memory_nodes_deinit();
			
 
				 
			
 
				 	_starpu_destroy_machine_config(config);
			
 
				+
			
 
				+	_starpu_deinitialize_workers_bindid(config);
			
 
				 }
			
 
				 
			
 
				 void
			
--- a/src/core/topology.h
+++ b/src/core/topology.h
@@ -63,10 +63,12 @@ void _starpu_topology_filter(hwloc_topology_t topology);
 
				 #endif
			
 
				 
			
 
				 #define STARPU_NOWORKERID -1
			
 
				+#define STARPU_ACTIVETHREAD -2
			
 
				+#define STARPU_NONACTIVETHREAD -3
			
 
				 /* Bind the current thread on the CPU logically identified by "cpuid". The
			
 
				  * logical ordering of the processors is either that of hwloc (if available),
			
 
				  * or the ordering exposed by the OS. */
			
 
				-void _starpu_bind_thread_on_cpu(int cpuid, int workerid);
			
 
				+int _starpu_bind_thread_on_cpu(int cpuid, int workerid, const char *name);
			
 
				 
			
 
				 struct _starpu_combined_worker;
			
 
				 /* Bind the current thread on the set of CPUs for the given combined worker. */
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -657,7 +657,7 @@ void _starpu_driver_start(struct _starpu_worker *worker, unsigned fut_key, unsig
 
				 	STARPU_PTHREAD_COND_SIGNAL(&worker->started_cond);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
			
 
				 
			
 
				-	_starpu_bind_thread_on_cpu(worker->bindid, worker->workerid);
			
 
				+	_starpu_bind_thread_on_cpu(worker->bindid, worker->workerid, NULL);
			
 
				 
			
 
				 #if defined(STARPU_PERF_DEBUG) && !defined(STARPU_SIMGRID)
			
 
				 	setitimer(ITIMER_PROF, &prof_itimer, NULL);
			
@@ -1471,7 +1471,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
				 
			
 
				 	int main_thread_cpuid = starpu_get_env_number_default("STARPU_MAIN_THREAD_CPUID", -1);
			
 
				 	if (main_thread_cpuid >= 0)
			
 
				-		_starpu_bind_thread_on_cpu(main_thread_cpuid, STARPU_NOWORKERID);
			
 
				+		_starpu_bind_thread_on_cpu(main_thread_cpuid, STARPU_NONACTIVETHREAD, "main");
			
 
				 
			
 
				 	_STARPU_DEBUG("Initialisation finished\n");
			
 
				 
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -366,6 +366,7 @@ struct _starpu_machine_config
 
				 	/* Where to bind next worker ? */
			
 
				 	int current_bindid;
			
 
				 	char currently_bound[STARPU_NMAXWORKERS];
			
 
				+	char currently_shared[STARPU_NMAXWORKERS];
			
 
				 
			
 
				 	/* Which GPU(s) do we use for CUDA ? */
			
 
				 	int current_cuda_gpuid;
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -105,7 +105,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 
				 		}
			
 
				 		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
			
 
				 			/* rebind to single CPU */
			
 
				-			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid);
			
 
				+			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid, NULL);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -701,7 +701,7 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
				 	}
			
 
				 
			
 
				 	/* one more time to avoid hacks from third party lib :) */
			
 
				-	_starpu_bind_thread_on_cpu(worker0->bindid, worker0->workerid);
			
 
				+	_starpu_bind_thread_on_cpu(worker0->bindid, worker0->workerid, NULL);
			
 
				 
			
 
				 	for (i = 0; i < worker_set->nworkers; i++)
			
 
				 	{
			
--- a/src/drivers/gordon/driver_gordon.c
+++ b/src/drivers/gordon/driver_gordon.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011-2012                                Inria
			
 
				- * Copyright (C) 2008-2015,2017                           Université de Bordeaux
			
 
				+ * Copyright (C) 2008-2015,2017-2018                      Université de Bordeaux
			
 
				  * Copyright (C) 2010-2011,2013,2015-2017                 CNRS
			
 
				  * Copyright (C) 2011                                     Télécom-SudParis
			
 
				  *
			
@@ -56,7 +56,7 @@ void *gordon_worker_progress(void *arg)
 
				 	struct _starpu_worker_set *gordon_set_arg = arg;
			
 
				 	unsigned prog_thread_bind_id =
			
 
				 		(gordon_set_arg->workers[0].bindid + 1)%(gordon_set_arg->config->nhwcores);
			
 
				-	_starpu_bind_thread_on_cpu(prog_thread_bind_id, gordon_set_arg->workers[0].workerid);
			
 
				+	_starpu_bind_thread_on_cpu(prog_thread_bind_id, gordon_set_arg->workers[0].workerid, NULL);
			
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
			
 
				 	progress_thread_is_inited = 1;
			
@@ -441,7 +441,7 @@ void *_starpu_gordon_worker(void *arg)
 
				 {
			
 
				 	struct _starpu_worker_set *gordon_set_arg = arg;
			
 
				 
			
 
				-	_starpu_bind_thread_on_cpu(gordon_set_arg->config, gordon_set_arg->workers[0].bindid, gordon_set_arg->workers[0].workerid);
			
 
				+	_starpu_bind_thread_on_cpu(gordon_set_arg->workers[0].bindid, gordon_set_arg->workers[0].workerid, NULL);
			
 
				 
			
 
				 	/* TODO set_local_memory_node per SPU */
			
 
				 	gordon_init(gordon_set_arg->nworkers);
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -629,7 +629,7 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 
				 	_starpu_opencl_init_context(devid);
			
 
				 
			
 
				 	/* one more time to avoid hacks from third party lib :) */
			
 
				-	_starpu_bind_thread_on_cpu(worker->bindid, worker->workerid);
			
 
				+	_starpu_bind_thread_on_cpu(worker->bindid, worker->workerid, NULL);
			
 
				 
			
 
				 	_starpu_opencl_limit_gpu_mem_if_needed(devid);
			
 
				 	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));