Browse Source

Add STARPU_CUDA_THREAD_PER_DEV environment variable to support driving all
GPUs from only one thread when almost all kernels are asynchronous.

Samuel Thibault 8 years ago
parent
commit
7cdeadd447

+ 2 - 0
ChangeLog

@@ -25,6 +25,8 @@ New features:
   * Add support for multiple linear regression performance models
   * Add MPI Master-Slave support to use the cores of remote nodes. Use the
     --enable-mpi-master-slave option to activate it.
+  * Add STARPU_CUDA_THREAD_PER_DEV environment variable to support driving all
+    GPUs from only one thread when almost all kernels are asynchronous.
 
 Small features:
   * Scheduling contexts may now be associated a user data pointer at creation

+ 8 - 1
doc/doxygen/chapters/390_faq.doxy

@@ -235,12 +235,19 @@ a task is finished, to feed the GPU with another task (StarPU actually submits
 a couple of tasks in advance so as to pipeline this, but filling the pipeline
 still has to be happening often enough), and thus it has to dedicate threads for
 this, and this is a very CPU-consuming duty. StarPU thus dedicates one CPU core
-for driving each GPU.
+for driving each GPU by default.
 
 Such dedication is also useful when a codelet is hybrid, i.e. while kernels are
 running on the GPU, the codelet can run some computation, which can thus be run by
 the CPU core instead of driving the GPU.
 
+One can choose to dedicate only one thread for all the CUDA devices by setting
+the STARPU_CUDA_THREAD_PER_DEV environment variable to 0. The application
+however should use STARPU_CUDA_ASYNC on its CUDA codelets (asynchronous
+execution), otherwise the execution of a synchronous CUDA codelet will
+monopolize the thread, and other CUDA devices will thus starve while it is
+executing.
+
 \section CUDADrivers StarPU does not see my CUDA device
 
 First make sure that CUDA is properly running outside StarPU: build and

+ 11 - 1
doc/doxygen/chapters/501_environment_variables.doxy

@@ -58,7 +58,17 @@ which will be concurrently running on the devices. The default value is 1.
 \addindex __env__STARPU_CUDA_THREAD_PER_WORKER
 Specify if the cuda driver should provide a thread per stream or a single thread 
 dealing with all the streams. 1 if one thread per stream, 0 otherwise. The default 
-value is 0.
+value is 0. Setting it to 1 is contradictory with setting STARPU_CUDA_THREAD_PER_DEV to 1.
+</dd>
+
+<dt>STARPU_CUDA_THREAD_PER_DEV</dt>
+<dd>
+\anchor STARPU_CUDA_THREAD_PER_DEV
+\addindex __env__STARPU_CUDA_THREAD_PER_DEV
+Specify if the cuda driver should provide a thread per device or a single thread 
+dealing with all the devices. 1 if one thread per device, 0 otherwise. The default 
+value is 1, unless STARPU_CUDA_THREAD_PER_WORKER is set to 1. Setting it to 1 is
+contradictory with setting STARPU_CUDA_THREAD_PER_WORKER to 1.
 </dd>
 
 <dt>STARPU_CUDA_PIPELINE</dt>

+ 54 - 8
src/core/topology.c

@@ -1220,15 +1220,49 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 	_starpu_initialize_workers_cuda_gpuid(config);
 
 	/* allow having one worker per stream */
-	topology->cuda_th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
+	topology->cuda_th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", -1);
+	topology->cuda_th_per_dev = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_DEV", -1);
+
+	/* per device by default */
+	if (topology->cuda_th_per_dev == -1)
+	{
+		if (topology->cuda_th_per_stream == 1)
+			topology->cuda_th_per_dev = 0;
+		else
+			topology->cuda_th_per_dev = 1;
+	}
+	/* Not per stream by default */
+	if (topology->cuda_th_per_stream == -1)
+	{
+		topology->cuda_th_per_stream = 0;
+	}
+
+	STARPU_ASSERT_MSG(topology->cuda_th_per_dev != 1 || topology->cuda_th_per_stream != 1, "It does not make sense to set both STARPU_CUDA_THREAD_PER_WORKER and STARPU_CUDA_THREAD_PER_DEV to 1, please choose either per worker or per device or none");
+
+	if (!topology->cuda_th_per_dev)
+	{
+		cuda_worker_set[0].workers = &config->workers[topology->nworkers];
+		cuda_worker_set[0].nworkers = topology->ncudagpus * nworker_per_cuda;
+	}
 
 	unsigned cudagpu;
 	for (cudagpu = 0; cudagpu < topology->ncudagpus; cudagpu++)
 	{
 		int devid = _starpu_get_next_cuda_gpuid(config);
 		int worker_idx0 = topology->nworkers + cudagpu * nworker_per_cuda;
-		cuda_worker_set[devid].workers = &config->workers[worker_idx0];
-		cuda_worker_set[devid].nworkers = nworker_per_cuda;
+		struct _starpu_worker_set *worker_set;
+
+		if (topology->cuda_th_per_dev)
+		{
+			worker_set = &cuda_worker_set[devid];
+			worker_set->workers = &config->workers[worker_idx0];
+			worker_set->nworkers = nworker_per_cuda;
+		}
+		else
+		{
+			/* Same worker set for all devices */
+			worker_set = &cuda_worker_set[0];
+		}
 
 		for (i = 0; i < nworker_per_cuda; i++)
 		{
@@ -1241,7 +1275,7 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 				config->workers[worker_idx].set->nworkers = 1;
 			}
 			else
-				config->workers[worker_idx].set = &cuda_worker_set[devid];
+				config->workers[worker_idx].set = worker_set;
 
 			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
 			_STARPU_MALLOC(config->workers[worker_idx].perf_arch.devices, sizeof(struct starpu_perfmodel_device));
@@ -1453,9 +1487,13 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 #endif
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
 			unsigned cuda_busy_cpus = 0;
-#if defined(STARPU_USE_CUDA)
-			cuda_busy_cpus = topology->cuda_th_per_stream ? (nworker_per_cuda * topology->ncudagpus) : 
-				topology->ncudagpus;
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+			cuda_busy_cpus =
+				topology->cuda_th_per_dev == 0 && topology->cuda_th_per_stream == 0 ?
+					(topology->ncudagpus ? 1 : 0) :
+				topology->cuda_th_per_stream ?
+					(nworker_per_cuda * topology->ncudagpus) :
+					topology->ncudagpus;
 #endif
 			unsigned already_busy_cpus = mpi_ms_busy_cpus + mic_busy_cpus 
 				+ cuda_busy_cpus
@@ -1731,6 +1769,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
 	unsigned cuda_bindid[STARPU_MAXCUDADEVS];
+	int cuda_globalbindid = -1;
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 	unsigned opencl_init[STARPU_MAXOPENCLDEVS] = { };
@@ -1821,7 +1860,14 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				else
 				{
 					cuda_init[devid] = 1;
-					workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+					if (config->topology.cuda_th_per_dev == 0 && config->topology.cuda_th_per_stream == 0)
+					{
+						if (cuda_globalbindid == -1)
+							cuda_globalbindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+						workerarg->bindid = cuda_bindid[devid] = cuda_globalbindid;
+					}
+					else
+						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
 
 					_starpu_cuda_bus_ids[0][devid+1] = _starpu_register_bus(STARPU_MAIN_RAM, memory_node);

+ 1 - 0
src/core/workers.h

@@ -244,6 +244,7 @@ struct _starpu_machine_topology
 	unsigned ncudagpus;
 	unsigned nworkerpercuda;
 	unsigned cuda_th_per_stream;
+	unsigned cuda_th_per_dev;
 
 	/* Actual number of OpenCL workers used by StarPU. */
 	unsigned nopenclgpus;

+ 25 - 16
src/drivers/cuda/driver_cuda.c

@@ -381,7 +381,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 	_starpu_memory_manager_set_global_memory_size(memnode, _starpu_cuda_get_global_mem_size(devid));
 }
 
-static void init_worker_context(unsigned workerid)
+static void init_worker_context(unsigned workerid, unsigned devid)
 {
 	int j;
 #ifdef STARPU_SIMGRID
@@ -389,6 +389,7 @@ static void init_worker_context(unsigned workerid)
 		task_finished[workerid][j] = 0;
 #else /* !STARPU_SIMGRID */
 	cudaError_t cures;
+	starpu_cuda_set_device(devid);
 
 	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
 	{
@@ -408,6 +409,7 @@ static void init_worker_context(unsigned workerid)
 static void deinit_device_context(unsigned devid)
 {
 	int i;
+	starpu_cuda_set_device(devid);
 
 	cudaStreamDestroy(in_transfer_streams[devid]);
 	cudaStreamDestroy(out_transfer_streams[devid]);
@@ -420,13 +422,14 @@ static void deinit_device_context(unsigned devid)
 }
 #endif /* !STARPU_SIMGRID */
 
-static void deinit_worker_context(unsigned workerid)
+static void deinit_worker_context(unsigned workerid, unsigned devid)
 {
 	unsigned j;
 #ifdef STARPU_SIMGRID
 	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
 		task_finished[workerid][j] = 0;
 #else /* STARPU_SIMGRID */
+	starpu_cuda_set_device(devid);
 	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
 		cudaEventDestroy(task_events[workerid][j]);
 	cudaStreamDestroy(streams[workerid]);
@@ -641,17 +644,20 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 		unsigned devid = worker->devid;
 		unsigned memnode = worker->memory_node;
 		if ((int) devid == lastdevid)
+		{
+#ifdef STARPU_SIMGRID
+			STARPU_ASSERT_MSG(0, "Simgrid mode does not support concurrent kernel execution yet\n");
+#else /* !STARPU_SIMGRID */
+			if (props[devid].concurrentKernels == 0)
+				_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but the device does not support concurrent kernel execution!\n", worker_set->nworkers);
+#endif /* !STARPU_SIMGRID */
+
 			/* Already initialized */
 			continue;
+		}
 		lastdevid = devid;
 		init_device_context(devid, memnode);
 
-#ifdef STARPU_SIMGRID
-		STARPU_ASSERT_MSG(worker_set->nworkers == 1, "Simgrid mode does not support concurrent kernel execution yet\n");
-#else /* !STARPU_SIMGRID */
-		if (worker_set->nworkers > 1 && props[devid].concurrentKernels == 0)
-			_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but the device does not support concurrent kernel execution!\n", worker_set->nworkers);
-#endif /* !STARPU_SIMGRID */
 	}
 
 	/* one more time to avoid hacks from third party lib :) */
@@ -662,6 +668,7 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 		struct _starpu_worker *worker = &worker_set->workers[i];
 		unsigned devid = worker->devid;
 		unsigned workerid = worker->workerid;
+		unsigned subdev = i % _starpu_get_machine_config()->topology.nworkerpercuda;
 
 		float size = (float) global_mem[devid] / (1<<30);
 #ifdef STARPU_SIMGRID
@@ -676,15 +683,15 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
 		if (props[devid].pciDomainID)
-			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, i, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
+			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, subdev, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
 		else
 #endif
-			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %02x:%02x.0)", devid, i, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
+			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %02x:%02x.0)", devid, subdev, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
 #else
-		snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB)", devid, i, devname, size);
+		snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB)", devid, subdev, devname, size);
 #endif
-		snprintf(worker->short_name, sizeof(worker->short_name), "CUDA %u.%u", devid, i);
-		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, i, worker->bindid);
+		snprintf(worker->short_name, sizeof(worker->short_name), "CUDA %u.%u", devid, subdev);
+		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, subdev, worker->bindid);
 
 		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
 		if (worker->pipeline_length > STARPU_MAX_PIPELINE)
@@ -701,7 +708,7 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 			worker->pipeline_length = 0;
 		}
 #endif
-		init_worker_context(workerid);
+		init_worker_context(workerid, worker->devid);
 
 		_STARPU_TRACE_WORKER_INIT_END(workerid);
 	}
@@ -767,6 +774,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		{
 			j = _starpu_get_job_associated_to_task(task);
 
+			_starpu_set_local_worker_key(worker);
 			_starpu_release_fetch_task_input_async(j, worker);
 			/* Reset it */
 			worker->task_transferring = NULL;
@@ -952,9 +960,10 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
 
 	for (i = 0; i < worker_set->nworkers; i++)
 	{
-		unsigned workerid = worker_set->workers[i].workerid;
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		unsigned workerid = worker->workerid;
 
-		deinit_worker_context(workerid);
+		deinit_worker_context(workerid, worker->devid);
 	}
 
 	worker_set->workers[0].worker_is_initialized = 0;

+ 7 - 5
src/drivers/cuda/starpu_cublas.c

@@ -27,12 +27,16 @@ static int cublas_initialized[STARPU_NMAXWORKERS];
 
 static unsigned get_idx(void) {
 	unsigned workerid = starpu_worker_get_id_check();
+	unsigned th_per_dev = _starpu_get_machine_config()->topology.cuda_th_per_dev;
 	unsigned th_per_stream = _starpu_get_machine_config()->topology.cuda_th_per_stream;
 
-	if (th_per_stream)
+	if (th_per_dev)
+		return starpu_worker_get_devid(workerid);
+	else if (th_per_stream)
 		return workerid;
 	else
-		return starpu_worker_get_devid(workerid);
+		/* same thread for all devices */
+		return 0;
 }
 
 static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
@@ -76,10 +80,8 @@ void starpu_cublas_shutdown(void)
 
 void starpu_cublas_set_stream(void)
 {
-#ifdef STARPU_USE_CUDA
-	if (
+	if (!_starpu_get_machine_config()->topology.cuda_th_per_dev ||
 		(!_starpu_get_machine_config()->topology.cuda_th_per_stream &&
 		 _starpu_get_machine_config()->topology.nworkerpercuda > 1))
 		cublasSetKernelStream(starpu_cuda_get_local_stream());
-#endif
 }