Selaa lähdekoodia

Warn when two workers are bound to the same cpu. Support ranges in STARPU_WORKERS_CPUID

Samuel Thibault 9 vuotta sitten
vanhempi
commit
877d29d194

+ 4 - 2
doc/doxygen/chapters/40environment_variables.doxy

@@ -134,13 +134,15 @@ CPUs. This is for instance useful when running the testsuite in parallel.
 <dd>
 \anchor STARPU_WORKERS_CPUID
 \addindex __env__STARPU_WORKERS_CPUID
-Passing an array of integers (starting from 0) in \ref STARPU_WORKERS_CPUID
+Passing an array of integers in \ref STARPU_WORKERS_CPUID
 specifies on which logical CPU the different workers should be
 bound. For instance, if <c>STARPU_WORKERS_CPUID = "0 1 4 5"</c>, the first
 worker will be bound to logical CPU #0, the second CPU worker will be bound to
 logical CPU #1 and so on.  Note that the logical ordering of the CPUs is either
 determined by the OS, or provided by the library <c>hwloc</c> in case it is
-available.
+available. Ranges can be provided: for instance,
+<c>STARPU_WORKERS_CPUID = "1:3 5"</c> will bind the first three workers to
+logical CPUs #1, #2, and #3, and the fourth worker to logical CPU #5.
 
 Note that the first workers correspond to the CUDA workers, then come the
 OpenCL workers, and finally the CPU workers. For example if

+ 11 - 11
src/core/perfmodel/perfmodel_bus.c

@@ -130,7 +130,7 @@ static hwloc_topology_t hwtopology;
 static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 	size_t size = SIZE;
 
 	/* Initialize CUDA context on the device */
@@ -139,13 +139,13 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	cudaSetDevice(dev);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 	/* hack to force the initialization */
 	cudaFree(0);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
         /* Get the maximum size which can be allocated on the device */
 	struct cudaDeviceProp prop;
@@ -161,7 +161,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	STARPU_ASSERT(cures == cudaSuccess);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 	/* Allocate a buffer on the host */
 	unsigned char *h_buffer;
@@ -169,14 +169,14 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	STARPU_ASSERT(cures == cudaSuccess);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 	/* Fill them */
 	memset(h_buffer, 0, size);
 	cudaMemset(d_buffer, 0, size);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 	unsigned iter;
 	double timing;
@@ -348,7 +348,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 	int not_initialized;
 
         struct _starpu_machine_config *config = _starpu_get_machine_config();
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 	/* Is the context already initialised ? */
         starpu_opencl_get_context(dev, &context);
@@ -380,7 +380,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 	}
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
 	/* Allocate a buffer on the device */
 	cl_mem d_buffer;
@@ -388,21 +388,21 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
         /* Allocate a buffer on the host */
 	unsigned char *h_buffer;
         h_buffer = (unsigned char *)malloc(size);
 	STARPU_ASSERT(h_buffer);
 
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
         /* Fill them */
 	memset(h_buffer, 0, size);
         err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
         if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
         clFinish(queue);
 	/* hack to avoid third party libs to rebind threads */
-	_starpu_bind_thread_on_cpu(config, cpu);
+	_starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
 
         unsigned iter;
 	double timing;

+ 1 - 1
src/core/sched_ctx.c

@@ -1896,7 +1896,7 @@ unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker)
 
 void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid)
 {
-	_starpu_bind_thread_on_cpu(_starpu_get_machine_config(), cpuid);
+	_starpu_bind_thread_on_cpu(_starpu_get_machine_config(), cpuid, STARPU_NOWORKERID);
 }
 
 unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id)

+ 33 - 1
src/core/topology.c

@@ -50,6 +50,9 @@
 static unsigned topology_is_initialized = 0;
 static int nobind;
 
+/* For checking whether two workers share the same PU, indexed by PU number */
+static int cpu_worker[STARPU_MAXCPUS];
+
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
 
 struct handle_entry
@@ -590,6 +593,22 @@ _starpu_initialize_workers_bindid (struct _starpu_machine_config *config)
 					topology->workers_bindid[i] =
 						(unsigned)(val % topology->nhwpus);
 					strval = endptr;
+					if (*strval == ':')
+					{
+						/* range of values */
+						long int endval;
+						strval++;
+						endval = strtol(strval, &endptr, 10);
+						strval = endptr;
+						for (val++; val <= endval && i < STARPU_NMAXWORKERS-1; val++)
+						{
+							i++;
+							topology->workers_bindid[i] =
+								(unsigned)(val % topology->nhwpus);
+						}
+					}
+					if (*strval == ',')
+						strval++;
 				}
 				else
 				{
@@ -648,6 +667,9 @@ _starpu_initialize_workers_bindid (struct _starpu_machine_config *config)
 			i++;
 		}
 	}
+
+	for (i = 0; i < STARPU_MAXCPUS;i++)
+		cpu_worker[i] = STARPU_NOWORKERID;
 }
 
 /* This function gets the identifier of the next core on which to bind a
@@ -1260,7 +1282,7 @@ void _starpu_destroy_machine_config(struct _starpu_machine_config *config)
 void
 _starpu_bind_thread_on_cpu (
 	struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED,
-	int cpuid STARPU_ATTRIBUTE_UNUSED)
+	int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_SIMGRID
 	return;
@@ -1269,6 +1291,16 @@ _starpu_bind_thread_on_cpu (
 		return;
 	if (cpuid < 0)
 		return;
+
+	if (workerid != STARPU_NOWORKERID && cpuid < STARPU_MAXCPUS)
+	{
+		int previous = cpu_worker[cpuid];
+		if (previous != STARPU_NOWORKERID && previous != workerid)
+			_STARPU_DISP("Warning: both workers %d and %d are bound to the same PU %d, this will strongly degrade performance\n", previous, workerid, cpuid);
+		else
+			cpu_worker[cpuid] = workerid;
+	}
+
 #ifdef STARPU_HAVE_HWLOC
 	const struct hwloc_topology_support *support;
 

+ 2 - 1
src/core/topology.h

@@ -41,10 +41,11 @@ unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config);
 /* returns the number of logical cpus */
 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
 
+#define STARPU_NOWORKERID -1
 /* Bind the current thread on the CPU logically identified by "cpuid". The
  * logical ordering of the processors is either that of hwloc (if available),
  * or the ordering exposed by the OS. */
-void _starpu_bind_thread_on_cpu(struct _starpu_machine_config *config, int cpuid);
+void _starpu_bind_thread_on_cpu(struct _starpu_machine_config *config, int cpuid, int workerid);
 
 struct _starpu_combined_worker;
 /* Bind the current thread on the set of CPUs for the given combined worker. */

+ 1 - 1
src/core/workers.c

@@ -600,7 +600,7 @@ void _starpu_driver_start(struct _starpu_worker *worker, unsigned fut_key, unsig
 	_starpu_worker_start(worker, fut_key, sync);
 #endif
 
-	_starpu_bind_thread_on_cpu(worker->config, worker->bindid);
+	_starpu_bind_thread_on_cpu(worker->config, worker->bindid, worker->workerid);
 
         _STARPU_DEBUG("worker %p %d for dev %d is ready on logical cpu %d\n", worker, worker->workerid, devid, worker->bindid);
 #ifdef STARPU_HAVE_HWLOC

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -112,7 +112,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 		}
 		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
 			/* rebind to single CPU */
-			_starpu_bind_thread_on_cpu(cpu_args->config, cpu_args->bindid);
+			_starpu_bind_thread_on_cpu(cpu_args->config, cpu_args->bindid, cpu_args->workerid);
 	}
 
 	_starpu_driver_end_job(cpu_args, j, perf_arch, &codelet_end, rank, profiling);

+ 1 - 1
src/drivers/cuda/driver_cuda.c

@@ -605,7 +605,7 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 	}
 
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(worker0->config, worker0->bindid);
+	_starpu_bind_thread_on_cpu(worker0->config, worker0->bindid, worker0->workerid);
 
 	for (i = 0; i < worker_set->nworkers; i++)
 	{

+ 3 - 3
src/drivers/gordon/driver_gordon.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2014  Université de Bordeaux
+ * Copyright (C) 2009-2015  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2013  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -55,7 +55,7 @@ void *gordon_worker_progress(void *arg)
 	struct _starpu_worker_set *gordon_set_arg = arg;
 	unsigned prog_thread_bind_id =
 		(gordon_set_arg->workers[0].bindid + 1)%(gordon_set_arg->config->nhwcores);
-	_starpu_bind_thread_on_cpu(gordon_set_arg->config, prog_thread_bind_id);
+	_starpu_bind_thread_on_cpu(gordon_set_arg->config, prog_thread_bind_id, gordon_set_arg->workers[0].workerid);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	progress_thread_is_inited = 1;
@@ -438,7 +438,7 @@ void *_starpu_gordon_worker(void *arg)
 {
 	struct _starpu_worker_set *gordon_set_arg = arg;
 
-	_starpu_bind_thread_on_cpu(gordon_set_arg->config, gordon_set_arg->workers[0].bindid);
+	_starpu_bind_thread_on_cpu(gordon_set_arg->config, gordon_set_arg->workers[0].bindid, gordon_set_arg->workers[0].workerid);
 
 	/* TODO set_local_memory_node per SPU */
 	gordon_init(gordon_set_arg->nworkers);

+ 1 - 1
src/drivers/opencl/driver_opencl.c

@@ -612,7 +612,7 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 	_starpu_opencl_init_context(devid);
 
 	/* one more time to avoid hacks from third party lib :) */
-	_starpu_bind_thread_on_cpu(worker->config, worker->bindid);
+	_starpu_bind_thread_on_cpu(worker->config, worker->bindid, worker->workerid);
 
 	_starpu_opencl_limit_gpu_mem_if_needed(devid);
 	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));