Browse Source

Warn when two workers are bound to the same CPU. Support ranges in STARPU_WORKERS_CPUID.

Samuel Thibault 9 years ago
parent
commit
877d29d194

+ 4 - 2
doc/doxygen/chapters/40environment_variables.doxy

@@ -134,13 +134,15 @@ CPUs. This is for instance useful when running the testsuite in parallel.
 <dd>
 <dd>
 \anchor STARPU_WORKERS_CPUID
 \anchor STARPU_WORKERS_CPUID
 \addindex __env__STARPU_WORKERS_CPUID
 \addindex __env__STARPU_WORKERS_CPUID
-Passing an array of integers (starting from 0) in \ref STARPU_WORKERS_CPUID
+Passing an array of integers in \ref STARPU_WORKERS_CPUID
 specifies on which logical CPU the different workers should be
 specifies on which logical CPU the different workers should be
 bound. For instance, if <c>STARPU_WORKERS_CPUID = "0 1 4 5"</c>, the first
 bound. For instance, if <c>STARPU_WORKERS_CPUID = "0 1 4 5"</c>, the first
 worker will be bound to logical CPU #0, the second CPU worker will be bound to
 worker will be bound to logical CPU #0, the second CPU worker will be bound to
 logical CPU #1 and so on.  Note that the logical ordering of the CPUs is either
 logical CPU #1 and so on.  Note that the logical ordering of the CPUs is either
 determined by the OS, or provided by the library <c>hwloc</c> in case it is
 determined by the OS, or provided by the library <c>hwloc</c> in case it is
-available.
+available. Inclusive ranges can be provided: for instance, <c>STARPU_WORKERS_CPUID = "1:3
+5"</c> will bind the first three workers to logical CPUs #1, #2, and #3, and the
+fourth worker to logical CPU #5.
 
 
 Note that the first workers correspond to the CUDA workers, then come the
 Note that the first workers correspond to the CUDA workers, then come the
 OpenCL workers, and finally the CPU workers. For example if
 OpenCL workers, and finally the CPU workers. For example if

+ 11 - 11
src/core/perfmodel/perfmodel_bus.c

@@ -130,7 +130,7 @@ static hwloc_topology_t hwtopology;
 static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
 static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
 {
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 	size_t size = SIZE;
 	size_t size = SIZE;
 
 
 	/* Initialize CUDA context on the device */
 	/* Initialize CUDA context on the device */
@@ -139,13 +139,13 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	cudaSetDevice(dev);
 	cudaSetDevice(dev);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 
 	/* hack to force the initialization */
 	/* hack to force the initialization */
 	cudaFree(0);
 	cudaFree(0);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 
         /* Get the maximum size which can be allocated on the device */
         /* Get the maximum size which can be allocated on the device */
 	struct cudaDeviceProp prop;
 	struct cudaDeviceProp prop;
@@ -161,7 +161,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	STARPU_ASSERT(cures == cudaSuccess);
 	STARPU_ASSERT(cures == cudaSuccess);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 
 	/* Allocate a buffer on the host */
 	/* Allocate a buffer on the host */
 	unsigned char *h_buffer;
 	unsigned char *h_buffer;
@@ -169,14 +169,14 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	STARPU_ASSERT(cures == cudaSuccess);
 	STARPU_ASSERT(cures == cudaSuccess);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 
 	/* Fill them */
 	/* Fill them */
 	memset(h_buffer, 0, size);
 	memset(h_buffer, 0, size);
 	cudaMemset(d_buffer, 0, size);
 	cudaMemset(d_buffer, 0, size);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 
 	unsigned iter;
 	unsigned iter;
 	double timing;
 	double timing;
@@ -348,7 +348,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 	int not_initialized;
 	int not_initialized;
 
 
         struct _starpu_machine_config *config = _starpu_get_machine_config();
         struct _starpu_machine_config *config = _starpu_get_machine_config();
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 
 	/* Is the context already initialised ? */
 	/* Is the context already initialised ? */
         starpu_opencl_get_context(dev, &context);
         starpu_opencl_get_context(dev, &context);
@@ -380,7 +380,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 	}
 	}
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 
 	/* Allocate a buffer on the device */
 	/* Allocate a buffer on the device */
 	cl_mem d_buffer;
 	cl_mem d_buffer;
@@ -388,21 +388,21 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
         /* Allocate a buffer on the host */
         /* Allocate a buffer on the host */
 	unsigned char *h_buffer;
 	unsigned char *h_buffer;
         h_buffer = (unsigned char *)malloc(size);
         h_buffer = (unsigned char *)malloc(size);
 	STARPU_ASSERT(h_buffer);
 	STARPU_ASSERT(h_buffer);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
         /* Fill them */
         /* Fill them */
 	memset(h_buffer, 0, size);
 	memset(h_buffer, 0, size);
         err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
         err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
         if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
         if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
         clFinish(queue);
         clFinish(queue);
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 
         unsigned iter;
         unsigned iter;
 	double timing;
 	double timing;

+ 1 - 1
src/core/sched_ctx.c

@@ -1896,7 +1896,7 @@ unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker)
 
 
 void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid)
 void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid)
 {
 {
-	_starpu_bind_thread_on_cpu(_starpu_get_machine_config(), cpuid);
+	_starpu_bind_thread_on_cpu(_starpu_get_machine_config(), cpuid, STARPU_NOWORKERID);
 }
 }
 
 
 unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id)
 unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id)

+ 33 - 1
src/core/topology.c

@@ -50,6 +50,9 @@
 static unsigned topology_is_initialized = 0;
 static unsigned topology_is_initialized = 0;
 static int nobind;
 static int nobind;
 
 
+/* For checking whether two workers share the same PU, indexed by PU number */
+static int cpu_worker[STARPU_MAXCPUS];
+
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
 
 
 struct handle_entry
 struct handle_entry
@@ -590,6 +593,22 @@ _starpu_initialize_workers_bindid (struct _starpu_machine_config *config)
 					topology->workers_bindid[i] =
 					topology->workers_bindid[i] =
 						(unsigned)(val % topology->nhwpus);
 						(unsigned)(val % topology->nhwpus);
 					strval = endptr;
 					strval = endptr;
+					if (*strval == ':')
+					{
+						/* range of values */
+						long int endval;
+						strval++;
+						endval = strtol(strval, &endptr, 10);
+						strval = endptr;
+						for (val++; val <= endval && i < STARPU_NMAXWORKERS-1; val++)
+						{
+							i++;
+							topology->workers_bindid[i] =
+								(unsigned)(val % topology->nhwpus);
+						}
+					}
+					if (*strval == ',')
+						strval++;
 				}
 				}
 				else
 				else
 				{
 				{
@@ -648,6 +667,9 @@ _starpu_initialize_workers_bindid (struct _starpu_machine_config *config)
 			i++;
 			i++;
 		}
 		}
 	}
 	}
+
+	for (i = 0; i < STARPU_MAXCPUS;i++)
+		cpu_worker[i] = STARPU_NOWORKERID;
 }
 }
 
 
 /* This function gets the identifier of the next core on which to bind a
 /* This function gets the identifier of the next core on which to bind a
@@ -1260,7 +1282,7 @@ void _starpu_destroy_machine_config(struct _starpu_machine_config *config)
 void
 void
 _starpu_bind_thread_on_cpu (
 _starpu_bind_thread_on_cpu (
 	struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED,
 	struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED,
-	int cpuid STARPU_ATTRIBUTE_UNUSED)
+	int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED)
 {
 {
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	return;
 	return;
@@ -1269,6 +1291,16 @@ _starpu_bind_thread_on_cpu (
 		return;
 		return;
 	if (cpuid < 0)
 	if (cpuid < 0)
 		return;
 		return;
+
+	if (workerid != STARPU_NOWORKERID && cpuid < STARPU_MAXCPUS)
+	{
+		int previous = cpu_worker[cpuid];
+		if (previous != STARPU_NOWORKERID && previous != workerid)
+			_STARPU_DISP("Warning: both workers %d and %d are bound to the same PU %d, this will strongly degrade performance\n", previous, workerid, cpuid);
+		else
+			cpu_worker[cpuid] = workerid;
+	}
+
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 	const struct hwloc_topology_support *support;
 	const struct hwloc_topology_support *support;
 
 

+ 2 - 1
src/core/topology.h

@@ -41,10 +41,11 @@ unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config);
 /* returns the number of logical cpus */
 /* returns the number of logical cpus */
 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
 
 
+#define STARPU_NOWORKERID -1
 /* Bind the current thread on the CPU logically identified by "cpuid". The
 /* Bind the current thread on the CPU logically identified by "cpuid". The
  * logical ordering of the processors is either that of hwloc (if available),
  * logical ordering of the processors is either that of hwloc (if available),
  * or the ordering exposed by the OS. */
  * or the ordering exposed by the OS. */
-void _starpu_bind_thread_on_cpu(struct _starpu_machine_config *config, int cpuid);
+void _starpu_bind_thread_on_cpu(struct _starpu_machine_config *config, int cpuid, int workerid);
 
 
 struct _starpu_combined_worker;
 struct _starpu_combined_worker;
 /* Bind the current thread on the set of CPUs for the given combined worker. */
 /* Bind the current thread on the set of CPUs for the given combined worker. */

+ 1 - 1
src/core/workers.c

@@ -600,7 +600,7 @@ void _starpu_driver_start(struct _starpu_worker *worker, unsigned fut_key, unsig
 	_starpu_worker_start(worker, fut_key, sync);
 	_starpu_worker_start(worker, fut_key, sync);
 #endif
 #endif
 
 
-	_starpu_bind_thread_on_cpu(worker->config, worker->bindid);
+	_starpu_bind_thread_on_cpu(worker->config, worker->bindid, worker->workerid);
 
 
         _STARPU_DEBUG("worker %p %d for dev %d is ready on logical cpu %d\n", worker, worker->workerid, devid, worker->bindid);
         _STARPU_DEBUG("worker %p %d for dev %d is ready on logical cpu %d\n", worker, worker->workerid, devid, worker->bindid);
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -112,7 +112,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 		}
 		}
 		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
 		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
 			/* rebind to single CPU */
 			/* rebind to single CPU */
-			_starpu_bind_thread_on_cpu(cpu_args->config, cpu_args->bindid);
+			_starpu_bind_thread_on_cpu(cpu_args->config, cpu_args->bindid, cpu_args->workerid);
 	}
 	}
 
 
 	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank, profiling);
 	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank, profiling);

+ 1 - 1
src/drivers/cuda/driver_cuda.c

@@ -605,7 +605,7 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 	}
 	}
 
 
 	/* one more time to avoid hacks from third party lib :) */
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(worker0->config, worker0->bindid);
+	_starpu_bind_thread_on_cpu(worker0->config, worker0->bindid, worker0->workerid);
 
 
 	for (i = 0; i < worker_set->nworkers; i++)
 	for (i = 0; i < worker_set->nworkers; i++)
 	{
 	{

+ 3 - 3
src/drivers/gordon/driver_gordon.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2014  Université de Bordeaux
+ * Copyright (C) 2009-2015  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2013  CNRS
  * Copyright (C) 2010, 2011, 2013  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
@@ -55,7 +55,7 @@ void *gordon_worker_progress(void *arg)
 	struct _starpu_worker_set *gordon_set_arg = arg;
 	struct _starpu_worker_set *gordon_set_arg = arg;
 	unsigned prog_thread_bind_id =
 	unsigned prog_thread_bind_id =
 		(gordon_set_arg->workers[0].bindid + 1)%(gordon_set_arg->config->nhwcores);
 		(gordon_set_arg->workers[0].bindid + 1)%(gordon_set_arg->config->nhwcores);
-	_starpu_bind_thread_on_cpu(gordon_set_arg->config, prog_thread_bind_id);
+	_starpu_bind_thread_on_cpu(gordon_set_arg->config, prog_thread_bind_id, gordon_set_arg->workers[0].workerid);
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	progress_thread_is_inited = 1;
 	progress_thread_is_inited = 1;
@@ -438,7 +438,7 @@ void *_starpu_gordon_worker(void *arg)
 {
 {
 	struct _starpu_worker_set *gordon_set_arg = arg;
 	struct _starpu_worker_set *gordon_set_arg = arg;
 
 
-	_starpu_bind_thread_on_cpu(gordon_set_arg->config, gordon_set_arg->workers[0].bindid);
+	_starpu_bind_thread_on_cpu(gordon_set_arg->config, gordon_set_arg->workers[0].bindid, gordon_set_arg->workers[0].workerid);
 
 
 	/* TODO set_local_memory_node per SPU */
 	/* TODO set_local_memory_node per SPU */
 	gordon_init(gordon_set_arg->nworkers);
 	gordon_init(gordon_set_arg->nworkers);

+ 1 - 1
src/drivers/opencl/driver_opencl.c

@@ -612,7 +612,7 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 	_starpu_opencl_init_context(devid);
 	_starpu_opencl_init_context(devid);
 
 
 	/* one more time to avoid hacks from third party lib :) */
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(worker->config, worker->bindid);
+	_starpu_bind_thread_on_cpu(worker->config, worker->bindid, worker->workerid);
 
 
 	_starpu_opencl_limit_gpu_mem_if_needed(devid);
 	_starpu_opencl_limit_gpu_mem_if_needed(devid);
 	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));
 	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));