Explorar o código

Make CUDA driver initialization able to cope with several card

Samuel Thibault %!s(int64=10) %!d(string=hai) anos
pai
achega
ea38c92f6f
Modificáronse 3 ficheiros con 128 adicións e 75 borrados
  1. 35 17
      src/core/workers.c
  2. 12 3
      src/core/workers.h
  3. 81 55
      src/drivers/cuda/driver_cuda.c

+ 35 - 17
src/core/workers.c

@@ -54,6 +54,7 @@ static int init_count = 0;
 static enum { UNINITIALIZED, CHANGING, INITIALIZED } initialized = UNINITIALIZED;
 
 static starpu_pthread_key_t worker_key;
+static starpu_pthread_key_t worker_set_key;
 
 static struct _starpu_machine_config config;
 
@@ -626,6 +627,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #if defined(STARPU_USE_MIC) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 		unsigned devid = workerarg->devid;
 #endif
+#ifdef STARPU_USE_CUDA
+		struct _starpu_worker_set *worker_set;
+#endif
 
 		_STARPU_DEBUG("initialising worker %u/%u\n", worker, nworkers);
 
@@ -668,24 +672,27 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = workerarg->devid;
-				workerarg->set = &cuda_worker_set[devid];
+				worker_set = &cuda_worker_set[devid];
+				workerarg->set = worker_set;
 
-				/* We spawn only one thread per CUDA device,
+				/* We spawn only one thread per CUDA driver,
 				 * which will control all CUDA workers of this
-				 * device. (by using a worker set). */
-				if (cuda_worker_set[devid].workers)
+				 * driver. (by using a worker set). */
+				if (worker_set->workers)
 					break;
 
-				cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+				worker_set->nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+
 #ifndef STARPU_NON_BLOCKING_DRIVERS
-				if (cuda_worker_set[devid].nworkers > 1)
+				if (worker_set->nworkers > 1)
 				{
 					_STARPU_DISP("Warning: reducing STARPU_NWORKER_PER_CUDA to 1 because blocking drivers are enabled\n");
-					cuda_worker_set[devid].nworkers = 1;
+					worker_set->nworkers = 1;
 				}
 #endif
-				cuda_worker_set[devid].workers = workerarg;
-				cuda_worker_set[devid].set_is_initialized = 0;
+
+				worker_set->workers = workerarg;
+				worker_set->set_is_initialized = 0;
 
 				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
@@ -695,10 +702,10 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 				STARPU_PTHREAD_CREATE_ON(
 					workerarg->name,
-					&cuda_worker_set[devid].worker_thread,
+					&worker_set->worker_thread,
 					NULL,
 					_starpu_cuda_worker,
-					&cuda_worker_set[devid],
+					worker_set,
 					_starpu_simgrid_get_host_by_worker(workerarg));
 #ifdef STARPU_USE_FXT
 				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
@@ -706,12 +713,12 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 #endif
-				STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
-				while (!cuda_worker_set[devid].set_is_initialized)
-					STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
-								 &cuda_worker_set[devid].mutex);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
-				cuda_worker_set[devid].started = 1;
+				STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
+				while (!worker_set->set_is_initialized)
+					STARPU_PTHREAD_COND_WAIT(&worker_set->ready_cond,
+								 &worker_set->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
+				worker_set->started = 1;
 
 				break;
 #endif
@@ -872,6 +879,16 @@ struct _starpu_worker *_starpu_get_local_worker_key(void)
 	return (struct _starpu_worker *) STARPU_PTHREAD_GETSPECIFIC(worker_key);
 }
 
+void _starpu_set_local_worker_set_key(struct _starpu_worker_set *worker)
+{
+	STARPU_PTHREAD_SETSPECIFIC(worker_set_key, worker);
+}
+
+struct _starpu_worker_set *_starpu_get_local_worker_set_key(void)
+{
+	return (struct _starpu_worker_set *) STARPU_PTHREAD_GETSPECIFIC(worker_set_key);
+}
+
 /* Initialize the starpu_conf with default values */
 int starpu_conf_init(struct starpu_conf *conf)
 {
@@ -1198,6 +1215,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 		_starpu_worker_init(&config.workers[worker], &config);
 
 	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
+	STARPU_PTHREAD_KEY_CREATE(&worker_set_key, NULL);
 
 	_starpu_build_tree();
 

+ 12 - 3
src/core/workers.h

@@ -384,18 +384,27 @@ unsigned _starpu_worker_can_block(unsigned memnode, struct _starpu_worker *worke
  * */
 void _starpu_block_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
 
+/* This function initializes the current thread for the given worker */
+void _starpu_worker_start(struct _starpu_worker *worker, unsigned fut_key);
+
 /* The _starpu_worker structure describes all the state of a StarPU worker.
  * This function sets the pthread key which stores a pointer to this structure.
  * */
 void _starpu_set_local_worker_key(struct _starpu_worker *worker);
 
-/* This function initializes the current thread for the given worker */
-void _starpu_worker_start(struct _starpu_worker *worker, unsigned fut_key);
-
 /* Returns the _starpu_worker structure that describes the state of the
  * current worker. */
 struct _starpu_worker *_starpu_get_local_worker_key(void);
 
+/* The _starpu_worker_set structure describes all the state of a StarPU worker_set.
+ * This function sets the pthread key which stores a pointer to this structure.
+ * */
+void _starpu_set_local_worker_set_key(struct _starpu_worker_set *worker_set);
+
+/* Returns the _starpu_worker_set structure that describes the state of the
+ * current worker_set. */
+struct _starpu_worker_set *_starpu_get_local_worker_set_key(void);
+
 /* Returns the _starpu_worker structure that describes the state of the
  * specified worker. */
 struct _starpu_worker *_starpu_get_worker_struct(unsigned id);

+ 81 - 55
src/drivers/cuda/driver_cuda.c

@@ -327,34 +327,10 @@ static void init_worker_context(unsigned workerid)
 #endif /* !STARPU_SIMGRID */
 }
 
-static void deinit_context(struct _starpu_worker_set *worker_set)
+#ifndef STARPU_SIMGRID
+static void deinit_device_context(unsigned devid)
 {
-	unsigned i, j;
-	int workerid;
-
-#ifdef STARPU_SIMGRID
-	for (i = 0; i < worker_set->nworkers; i++)
-	{
-		workerid = worker_set->workers[i].workerid;
-		for (j = 0; j < STARPU_MAX_PIPELINE; j++)
-		{
-			STARPU_PTHREAD_MUTEX_DESTROY(&task_mutex[workerid][j]);
-			STARPU_PTHREAD_COND_DESTROY(&task_cond[workerid][j]);
-		}
-	}
-#else /* !STARPU_SIMGRID */
-	workerid = worker_set->workers[0].workerid;
-	int devid = starpu_worker_get_devid(workerid);
-
-	for (i = 0; i < worker_set->nworkers; i++)
-	{
-		workerid = worker_set->workers[i].workerid;
-		devid = starpu_worker_get_devid(workerid);
-
-		for (j = 0; j < STARPU_MAX_PIPELINE; j++)
-			cudaEventDestroy(task_events[workerid][j]);
-		cudaStreamDestroy(streams[workerid]);
-	}
+	unsigned i;
 
 	cudaStreamDestroy(in_transfer_streams[devid]);
 	cudaStreamDestroy(out_transfer_streams[devid]);
@@ -364,6 +340,22 @@ static void deinit_context(struct _starpu_worker_set *worker_set)
 		cudaStreamDestroy(in_peer_transfer_streams[i][devid]);
 		cudaStreamDestroy(out_peer_transfer_streams[devid][i]);
 	}
+}
+#endif /* !STARPU_SIMGRID */
+
+static void deinit_worker_context(unsigned workerid)
+{
+	unsigned j;
+#ifdef STARPU_SIMGRID
+	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
+	{
+		STARPU_PTHREAD_MUTEX_DESTROY(&task_mutex[workerid][j]);
+		STARPU_PTHREAD_COND_DESTROY(&task_cond[workerid][j]);
+	}
+#else /* STARPU_SIMGRID */
+	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
+		cudaEventDestroy(task_events[workerid][j]);
+	cudaStreamDestroy(streams[workerid]);
 #endif /* STARPU_SIMGRID */
 }
 
@@ -557,52 +549,66 @@ static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker
 int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 {
 	struct _starpu_worker *worker0 = &worker_set->workers[0];
-	unsigned devid = worker0->devid;
+	int lastdevid = -1;
 	unsigned i;
 
 	_starpu_worker_start(worker0, _STARPU_FUT_CUDA_KEY);
+	_starpu_set_local_worker_set_key(worker_set);
 
 #ifdef STARPU_USE_FXT
-	unsigned memnode = worker0->memory_node;
 	for (i = 1; i < worker_set->nworkers; i++)
 	{
 		struct _starpu_worker *worker = &worker_set->workers[i];
+		unsigned devid = worker->devid;
+		unsigned memnode = worker->memory_node;
 		_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CUDA_KEY, worker->workerid, devid, memnode);
 	}
 #endif
 
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		unsigned devid = worker->devid;
+		unsigned memnode = worker->memory_node;
+		if ((int) devid == lastdevid)
+			/* Already initialized */
+			continue;
+		lastdevid = devid;
 #ifndef STARPU_SIMGRID
-	init_device_context(devid);
+		init_device_context(devid);
 #endif
 
 #ifdef STARPU_SIMGRID
-	STARPU_ASSERT_MSG(worker_set->nworkers == 1, "Simgrid mode does not support concurrent kernel execution yet\n");
+		STARPU_ASSERT_MSG(worker_set->nworkers == 1, "Simgrid mode does not support concurrent kernel execution yet\n");
 #else /* !STARPU_SIMGRID */
-	if (worker_set->nworkers > 1 && props[devid].concurrentKernels == 0)
-		_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but the device does not support concurrent kernel execution!\n", worker_set->nworkers);
+		if (worker_set->nworkers > 1 && props[devid].concurrentKernels == 0)
+			_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but the device does not support concurrent kernel execution!\n", worker_set->nworkers);
 #endif /* !STARPU_SIMGRID */
 
-	_starpu_cuda_limit_gpu_mem_if_needed(devid);
-	_starpu_memory_manager_set_global_memory_size(worker0->memory_node, _starpu_cuda_get_global_mem_size(devid));
+		_starpu_cuda_limit_gpu_mem_if_needed(devid);
+		_starpu_memory_manager_set_global_memory_size(memnode, _starpu_cuda_get_global_mem_size(devid));
 
-	_starpu_malloc_init(worker0->memory_node);
+		_starpu_malloc_init(memnode);
+	}
 
 	/* one more time to avoid hacks from third party lib :) */
 	_starpu_bind_thread_on_cpu(worker0->config, worker0->bindid);
 
-	float size = (float) global_mem[devid] / (1<<30);
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		unsigned devid = worker->devid;
+		unsigned workerid = worker->workerid;
+
+		float size = (float) global_mem[devid] / (1<<30);
 #ifdef STARPU_SIMGRID
-	const char *devname = "Simgrid";
+		const char *devname = "Simgrid";
 #else
-	/* get the device's name */
-	char devname[128];
-	strncpy(devname, props[devid].name, 128);
+		/* get the device's name */
+		char devname[128];
+		strncpy(devname, props[devid].name, 128);
 #endif
 
-	for (i = 0; i < worker_set->nworkers; i++)
-	{
-		struct _starpu_worker *worker = &worker_set->workers[i];
-		unsigned workerid = worker_set->workers[i].workerid;
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
 		if (props[devid].pciDomainID)
@@ -801,22 +807,42 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	return 0;
 }
 
-int _starpu_cuda_driver_deinit(struct _starpu_worker_set *arg)
+int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
 {
-	struct _starpu_worker *worker = &arg->workers[0];
-	unsigned memnode = worker->memory_node;
+	int lastdevid = -1;
+	unsigned i;
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
-	_starpu_handle_all_pending_node_data_requests(memnode);
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		unsigned devid = worker->devid;
+		unsigned memnode = worker->memory_node;
+		if ((int) devid == lastdevid)
+			/* Already initialized */
+			continue;
+		lastdevid = devid;
+
+		_starpu_handle_all_pending_node_data_requests(memnode);
 
-	/* In case there remains some memory that was automatically
-	 * allocated by StarPU, we release it now. Note that data
-	 * coherency is not maintained anymore at that point ! */
-	_starpu_free_all_automatically_allocated_buffers(memnode);
+		/* In case there remains some memory that was automatically
+		 * allocated by StarPU, we release it now. Note that data
+		 * coherency is not maintained anymore at that point ! */
+		_starpu_free_all_automatically_allocated_buffers(memnode);
 
-	_starpu_malloc_shutdown(memnode);
+		_starpu_malloc_shutdown(memnode);
 
-	deinit_context(arg);
+#ifndef STARPU_SIMGRID
+		deinit_device_context(devid);
+#endif /* !STARPU_SIMGRID */
+	}
+
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		unsigned workerid = worker_set->workers[i].workerid;
+
+		deinit_worker_context(workerid);
+	}
 
 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);