瀏覽代碼

Add CUDA concurrent kernel execution support through the STARPU_NWORKER_PER_CUDA environment variable.

Samuel Thibault 11 年之前
父節點
當前提交
092f322b1c

+ 2 - 0
ChangeLog

@@ -47,6 +47,8 @@ New features:
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
     CUDA and OpenCL kernel execution.
   * Add paje traces statistics tools.
+  * Add CUDA concurrent kernel execution support through
+    the STARPU_NWORKER_PER_CUDA environment variable.
 
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and

+ 3 - 3
src/common/fxt.h

@@ -288,8 +288,8 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_START(workerkind, workerid, devid, memnode)	\
 	FUT_DO_PROBE5(_STARPU_FUT_WORKER_INIT_START, workerkind, workerid, devid, memnode, _starpu_gettid());
 
-#define _STARPU_TRACE_WORKER_INIT_END				\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid());
+#define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
+	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 
 #define _STARPU_TRACE_START_CODELET_BODY(job)				\
 do {									\
@@ -646,7 +646,7 @@ do {										\
 /* Dummy macros in case FxT is disabled */
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
-#define _STARPU_TRACE_WORKER_INIT_END		do {} while(0)
+#define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
 #define _STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0)
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)

+ 24 - 16
src/core/topology.c

@@ -821,6 +821,9 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	int ncuda = config->conf->ncuda;
+	int nworker_per_cuda = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+
+	STARPU_ASSERT_MSG(nworker_per_cuda > 0, "STARPU_NWORKER_PER_CUDA has to be > 0");
 
 	if (ncuda != 0)
 	{
@@ -855,25 +858,30 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	unsigned cudagpu;
 	for (cudagpu = 0; cudagpu < topology->ncudagpus; cudagpu++)
 	{
-		int worker_idx = topology->nworkers + cudagpu;
-		config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
 		int devid = _starpu_get_next_cuda_gpuid(config);
-		config->workers[worker_idx].perf_arch.type = STARPU_CUDA_WORKER;
-		config->workers[worker_idx].perf_arch.devid = cudagpu;
-		config->workers[worker_idx].perf_arch.ncore = 0;
-		config->workers[worker_idx].devid = devid;
-		config->workers[worker_idx].subworkerid = 0;
-		config->workers[worker_idx].worker_mask = STARPU_CUDA;
-		config->worker_mask |= STARPU_CUDA;
-
-		struct handle_entry *entry;
-		entry = (struct handle_entry *) malloc(sizeof(*entry));
-		STARPU_ASSERT(entry != NULL);
-		entry->gpuid = devid;
-		HASH_ADD_INT(devices_using_cuda, gpuid, entry);
+		for (i = 0; i < nworker_per_cuda; i++)
+		{
+			int worker_idx = topology->nworkers + cudagpu * nworker_per_cuda + i;
+			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
+			config->workers[worker_idx].perf_arch.type = STARPU_CUDA_WORKER;
+			config->workers[worker_idx].perf_arch.devid = devid;
+			// TODO: fix perfmodels etc.
+			//config->workers[worker_idx].perf_arch.ncore = nworker_per_cuda - 1;
+			config->workers[worker_idx].perf_arch.ncore = 0;
+			config->workers[worker_idx].devid = devid;
+			config->workers[worker_idx].subworkerid = i;
+			config->workers[worker_idx].worker_mask = STARPU_CUDA;
+			config->worker_mask |= STARPU_CUDA;
+
+			struct handle_entry *entry;
+			entry = (struct handle_entry *) malloc(sizeof(*entry));
+			STARPU_ASSERT(entry != NULL);
+			entry->gpuid = devid;
+			HASH_ADD_INT(devices_using_cuda, gpuid, entry);
+		}
         }
 
-	topology->nworkers += topology->ncudagpus;
+	topology->nworkers += topology->ncudagpus * nworker_per_cuda;
 #endif
 
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)

+ 36 - 22
src/core/workers.c

@@ -326,6 +326,9 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
  * Runtime initialization methods
  */
 
+#ifdef STARPU_USE_CUDA
+static struct _starpu_worker_set cuda_worker_set[STARPU_MAXCUDADEVS];
+#endif
 #ifdef STARPU_USE_MIC
 static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
 #endif
@@ -512,7 +515,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
-#ifdef STARPU_USE_MIC
+#if defined(STARPU_USE_MIC) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 		unsigned devid = workerarg->devid;
 #endif
 
@@ -560,12 +563,22 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				driver.id.cuda_id = workerarg->devid;
 				if (_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
+					/* We spawn only one thread per CUDA device,
+					 * which will control all CUDA workers of this
+					 * device. (by using a worker set). */
+					if (cuda_worker_set[devid].started)
+						goto worker_set_initialized;
+
+					cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+					cuda_worker_set[devid].workers = workerarg;
+					cuda_worker_set[devid].set_is_initialized = 0;
+
 					STARPU_PTHREAD_CREATE_ON(
 						workerarg->name,
-						&workerarg->worker_thread,
+						&cuda_worker_set[devid].worker_thread,
 						NULL,
 						_starpu_cuda_worker,
-						workerarg,
+						&cuda_worker_set[devid],
 						worker+1);
 #ifdef STARPU_USE_FXT
 					STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
@@ -573,6 +586,14 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 						STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 #endif
+					STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
+					while (!cuda_worker_set[devid].set_is_initialized)
+						STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
+									 &cuda_worker_set[devid].mutex);
+					STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
+					cuda_worker_set[devid].started = 1;
+		worker_set_initialized:
+					workerarg->set = &cuda_worker_set[devid];
 				}
 				else
 				{
@@ -697,14 +718,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				cpu++;
 				break;
 			case STARPU_CUDA_WORKER:
-				driver.id.cuda_id = workerarg->devid;
-				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
-					break;
-				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
-				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-				while (!workerarg->worker_is_initialized)
-					STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+				/* Already waited above */
 				break;
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 			case STARPU_OPENCL_WORKER:
@@ -1732,11 +1746,17 @@ int _starpu_worker_get_nsched_ctxs(int workerid)
 	return config.workers[workerid].nsched_ctxs;
 }
 
-static struct _starpu_worker *
+static void *
 _starpu_get_worker_from_driver(struct starpu_driver *d)
 {
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned workerid;
+
+#ifdef STARPU_USE_CUDA
+	if (d->type == STARPU_CUDA_WORKER)
+		return &cuda_worker_set[d->id.cuda_id];
+#endif
+
 	for (workerid = 0; workerid < nworkers; workerid++)
 	{
 		if (starpu_worker_get_type(workerid) == d->type)
@@ -1751,12 +1771,6 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 					return worker;
 				break;
 #endif
-#ifdef STARPU_USE_CUDA
-			case STARPU_CUDA_WORKER:
-				if (worker->devid == d->id.cuda_id)
-					return worker;
-				break;
-#endif
 #ifdef STARPU_USE_OPENCL
 			case STARPU_OPENCL_WORKER:
 			{
@@ -1786,7 +1800,7 @@ starpu_driver_run(struct starpu_driver *d)
 		return -EINVAL;
 	}
 
-	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+	void *worker = _starpu_get_worker_from_driver(d);
 
 	switch (d->type)
 	{
@@ -1812,7 +1826,7 @@ int
 starpu_driver_init(struct starpu_driver *d)
 {
 	STARPU_ASSERT(d);
-	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+	void *worker = _starpu_get_worker_from_driver(d);
 
 	switch (d->type)
 	{
@@ -1837,7 +1851,7 @@ int
 starpu_driver_run_once(struct starpu_driver *d)
 {
 	STARPU_ASSERT(d);
-	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+	void *worker = _starpu_get_worker_from_driver(d);
 
 	switch (d->type)
 	{
@@ -1862,7 +1876,7 @@ int
 starpu_driver_deinit(struct starpu_driver *d)
 {
 	STARPU_ASSERT(d);
-	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+	void *worker = _starpu_get_worker_from_driver(d);
 
 	switch (d->type)
 	{

+ 9 - 2
src/debug/traces/starpu_fxt.c

@@ -164,7 +164,9 @@ static void register_worker_id(unsigned long tid, int workerid)
 	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase in ./configure invocation the maximum number of CPUs and GPUs to the same value as was used for execution");
 
 	/* only register a thread once */
-	STARPU_ASSERT(entry == NULL);
+	//STARPU_ASSERT(entry == NULL);
+	if (entry)
+		return;
 
 	entry = malloc(sizeof(*entry));
 	entry->tid = tid;
@@ -423,12 +425,17 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 static void handle_worker_init_end(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	char *prefix = options->file_prefix;
+	int worker;
 
 	if (out_paje_file)
 		worker_set_state(get_event_time_stamp(ev, options), prefix, ev->param[0], "B");
 
+	if (ev->nb_params < 2)
+		worker = find_worker_id(ev->param[0]);
+	else
+		worker = ev->param[1];
+
 	/* Initilize the accumulated time counters */
-	int worker = find_worker_id(ev->param[0]);
 	last_activity_flush_timestamp[worker] = get_event_time_stamp(ev, options);
 	accumulated_sleep_time[worker] = 0.0;
 	accumulated_exec_time[worker] = 0.0;

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -176,7 +176,7 @@ int _starpu_cpu_driver_init(struct _starpu_worker *cpu_worker)
 
 	cpu_worker->status = STATUS_UNKNOWN;
 
-	_STARPU_TRACE_WORKER_INIT_END;
+	_STARPU_TRACE_WORKER_INIT_END(cpu_worker->workerid);
 
 	/* tell the main thread that we are ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&cpu_worker->mutex);

+ 140 - 80
src/drivers/cuda/driver_cuda.c

@@ -37,7 +37,7 @@
 #endif
 
 /* the number of CUDA devices */
-static int ncudagpus;
+static unsigned ncudagpus;
 
 static size_t global_mem[STARPU_MAXCUDADEVS];
 #ifdef STARPU_USE_CUDA
@@ -196,11 +196,11 @@ done:
 }
 
 #ifndef STARPU_SIMGRID
-static void init_context(unsigned devid)
+static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 {
 	cudaError_t cures;
 	int workerid;
-	int i;
+	unsigned i;
 
 	/* TODO: cudaSetDeviceFlag(cudaDeviceMapHost) */
 
@@ -251,23 +251,26 @@ static void init_context(unsigned devid)
 	}
 #endif
 
-	workerid = starpu_worker_get_id();
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		workerid = worker_set->workers[i].workerid;
 
-	cures = cudaEventCreate(&task_events[workerid]);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaEventCreateWithFlags(&task_events[workerid], cudaEventDisableTiming);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaStreamCreate(&streams[workerid]);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaStreamCreate(&streams[workerid]);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaStreamCreate(&in_transfer_streams[workerid]);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaStreamCreate(&in_transfer_streams[workerid]);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaStreamCreate(&out_transfer_streams[workerid]);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaStreamCreate(&out_transfer_streams[workerid]);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
 
 	for (i = 0; i < ncudagpus; i++)
 	{
@@ -277,16 +280,23 @@ static void init_context(unsigned devid)
 	}
 }
 
-static void deinit_context(int workerid)
+static void deinit_context(struct _starpu_worker_set *worker_set)
 {
 	cudaError_t cures;
-	int devid = starpu_worker_get_devid(workerid);
-	int i;
+	unsigned i;
+	int workerid, devid;
+
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		workerid = worker_set->workers[i].workerid;
+		devid = starpu_worker_get_devid(workerid);
+
+		cudaEventDestroy(task_events[workerid]);
+		cudaStreamDestroy(streams[workerid]);
+		cudaStreamDestroy(in_transfer_streams[workerid]);
+		cudaStreamDestroy(out_transfer_streams[workerid]);
+	}
 
-	cudaEventDestroy(task_events[workerid]);
-	cudaStreamDestroy(streams[workerid]);
-	cudaStreamDestroy(in_transfer_streams[workerid]);
-	cudaStreamDestroy(out_transfer_streams[workerid]);
 	for (i = 0; i < ncudagpus; i++)
 		cudaStreamDestroy(peer_transfer_streams[i][devid]);
 
@@ -347,7 +357,6 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 	STARPU_ASSERT(cl);
 
 	_starpu_set_current_task(task);
-	args->current_task = j->task;
 
 	ret = _starpu_fetch_task_input(j);
 	if (ret != 0)
@@ -399,14 +408,25 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 }
 
 /* XXX Should this be merged with _starpu_init_cuda ? */
-int _starpu_cuda_driver_init(struct _starpu_worker *args)
+int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 {
+	struct _starpu_worker *args = &worker_set->workers[0];
 	unsigned devid = args->devid;
+	unsigned i;
 
 	_starpu_worker_start(args, _STARPU_FUT_CUDA_KEY);
 
+#ifdef STARPU_USE_FXT
+	unsigned memnode = args->memory_node;
+	for (i = 1; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CUDA_KEY, worker->workerid, devid, memnode);
+	}
+#endif
+
 #ifndef STARPU_SIMGRID
-	init_context(devid);
+	init_context(worker_set, devid);
 #endif
 
 	_starpu_cuda_limit_gpu_mem_if_needed(devid);
@@ -441,7 +461,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker *args)
 	snprintf(args->short_name, sizeof(args->short_name), "CUDA %u", devid);
 	_STARPU_DEBUG("cuda (%s) dev id %u thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
-	_STARPU_TRACE_WORKER_INIT_END;
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		_STARPU_TRACE_WORKER_INIT_END(worker->workerid);
+	}
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
@@ -449,93 +473,132 @@ int _starpu_cuda_driver_init(struct _starpu_worker *args)
 	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
 
+	/* tell the main thread that this one is ready */
+	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
+	worker_set->set_is_initialized = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
+
 	return 0;
 }
 
-int _starpu_cuda_driver_run_once(struct _starpu_worker *args)
+int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 {
-	unsigned memnode = args->memory_node;
-	int workerid = args->workerid;
-
-	struct starpu_task *task;
+	struct _starpu_worker *worker0 = &worker_set->workers[0];
+	unsigned memnode = worker0->memory_node;
+	struct starpu_task *tasks[worker_set->nworkers], *task;
 	struct _starpu_job *j;
+	int i, res, idle;
 
-	task = args->current_task;
-
-	if (task)
+	/* First poll for completed jobs */
+	idle = 0;
+	for (i = 0; i < (int) worker_set->nworkers; i++)
 	{
+		struct _starpu_worker *args = &worker_set->workers[i];
+		int workerid = args->workerid;
+
+		task = args->current_task;
+
+		if (!task)
+		{
+			idle++;
+			continue;
+		}
+
 		/* On-going asynchronous task, check for its termination first */
 		cudaError_t cures = cudaEventQuery(task_events[workerid]);
 
 		if (cures != cudaSuccess)
 		{
-			/* Not ready yet, no better thing to do than waiting */
-			__starpu_datawizard_progress(memnode, 1, 0);
-
 			STARPU_ASSERT(cures == cudaErrorNotReady);
-			return 0;
+			idle++;
 		}
+		else
+		{
+			/* Asynchronous task completed! */
+			_starpu_set_local_worker_key(args);
+			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
+		}
+	}
 
-		/* Asynchronous task completed! */
-		finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
+	if (!idle)
+	{
+		/* Nothing ready yet, no better thing to do than waiting */
+		__starpu_datawizard_progress(memnode, 1, 0);
+		return 0;
 	}
 
+	/* Something done, make some progress */
 	__starpu_datawizard_progress(memnode, 1, 1);
 
-	task = _starpu_get_worker_task(args, workerid, memnode);
+	/* And pull tasks */
+	res = _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers);
 
-	if (!task)
+	if (!res)
 		return 0;
 
-	j = _starpu_get_job_associated_to_task(task);
-
-	/* can CUDA do that task ? */
-	if (!_STARPU_CUDA_MAY_PERFORM(j))
+	for (i = 0; i < (int) worker_set->nworkers; i++)
 	{
-		/* this is neither a cuda or a cublas task */
-		_starpu_push_task_to_workers(task);
-		return 0;
-	}
+		struct _starpu_worker *args = &worker_set->workers[i];
+		int workerid = args->workerid;
 
-	_STARPU_TRACE_END_PROGRESS(memnode);
-	int res = start_job_on_cuda(j, args);
+		task = tasks[i];
+		if (!task)
+			continue;
 
-	if (res)
-	{
-		switch (res)
+		_starpu_set_local_worker_key(args);
+
+		j = _starpu_get_job_associated_to_task(task);
+
+		/* can CUDA do that task ? */
+		if (!_STARPU_CUDA_MAY_PERFORM(j))
 		{
-			case -EAGAIN:
-				_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
-				_starpu_push_task_to_workers(task);
-				STARPU_ABORT();
-			default:
-				STARPU_ABORT();
+			/* this is neither a cuda or a cublas task */
+			_starpu_push_task_to_workers(task);
+			continue;
+		}
+
+		_STARPU_TRACE_END_PROGRESS(memnode);
+		res = start_job_on_cuda(j, args);
+
+		if (res)
+		{
+			switch (res)
+			{
+				case -EAGAIN:
+					_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
+					_starpu_push_task_to_workers(task);
+					STARPU_ABORT();
+				default:
+					STARPU_ABORT();
+			}
 		}
-	}
 
 #ifndef STARPU_SIMGRID
-	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
-	{
-		/* Record event to synchronize with task termination later */
-		cudaEventRecord(task_events[workerid], starpu_cuda_get_local_stream());
-	}
-	else
+		if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+		{
+			/* Record event to synchronize with task termination later */
+			cudaEventRecord(task_events[workerid], starpu_cuda_get_local_stream());
+		}
+		else
 #else
 #ifdef STARPU_DEVEL
 #warning No CUDA asynchronous execution with simgrid yet.
 #endif
 #endif
-	/* Synchronous execution */
-	{
-		finish_job_on_cuda(j, args);
+		/* Synchronous execution */
+		{
+			finish_job_on_cuda(j, args);
+		}
+		_STARPU_TRACE_START_PROGRESS(memnode);
 	}
-	_STARPU_TRACE_START_PROGRESS(memnode);
 
 	return 0;
 }
 
-int _starpu_cuda_driver_deinit(struct _starpu_worker *args)
+int _starpu_cuda_driver_deinit(struct _starpu_worker_set *arg)
 {
+	struct _starpu_worker *args = &arg->workers[0];
 	unsigned memnode = args->memory_node;
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
@@ -549,7 +612,7 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker *args)
 	_starpu_malloc_shutdown(memnode);
 
 #ifndef STARPU_SIMGRID
-	deinit_context(args->workerid);
+	deinit_context(arg);
 #endif
 
 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);
@@ -559,7 +622,7 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker *args)
 
 void *_starpu_cuda_worker(void *arg)
 {
-	struct _starpu_worker* args = arg;
+	struct _starpu_worker_set* args = arg;
 
 	_starpu_cuda_driver_init(args);
 	_STARPU_TRACE_START_PROGRESS(memnode);
@@ -684,11 +747,8 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 }
 #endif /* STARPU_USE_CUDA */
 
-int _starpu_run_cuda(struct _starpu_worker *workerarg)
+int _starpu_run_cuda(struct _starpu_worker_set *workerarg)
 {
-	workerarg->set = NULL;
-	workerarg->worker_is_initialized = 0;
-
 	/* Let's go ! */
 	_starpu_cuda_worker(workerarg);
 

+ 5 - 5
src/drivers/cuda/driver_cuda.h

@@ -52,11 +52,11 @@ cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
 cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
 
-struct _starpu_worker;
-int _starpu_run_cuda(struct _starpu_worker *);
-int _starpu_cuda_driver_init(struct _starpu_worker *);
-int _starpu_cuda_driver_run_once(struct _starpu_worker *);
-int _starpu_cuda_driver_deinit(struct _starpu_worker *);
+struct _starpu_worker_set;
+int _starpu_run_cuda(struct _starpu_worker_set *);
+int _starpu_cuda_driver_init(struct _starpu_worker_set *);
+int _starpu_cuda_driver_run_once(struct _starpu_worker_set *);
+int _starpu_cuda_driver_deinit(struct _starpu_worker_set *);
 #endif
 
 #endif //  __DRIVER_CUDA_H__

+ 10 - 1
src/drivers/mic/driver_mic_source.c

@@ -518,6 +518,11 @@ void *_starpu_mic_src_worker(void *arg)
 	/* unsigned memnode = baseworker->memory_node; */
 
 	_starpu_worker_start(baseworker, _STARPU_FUT_MIC_KEY);
+	for (i = 1; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		_STARPU_TRACE_WORKER_INIT_END(workerid);
+	}
 
 	// Current task for a thread managing a worker set has no sense.
 	_starpu_set_current_task(NULL);
@@ -530,7 +535,11 @@ void *_starpu_mic_src_worker(void *arg)
 
 	baseworker->status = STATUS_UNKNOWN;
 
-	_STARPU_TRACE_WORKER_INIT_END;
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		_STARPU_TRACE_WORKER_INIT_END(workerid);
+	}
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);

+ 2 - 1
src/drivers/opencl/driver_opencl.c

@@ -568,6 +568,7 @@ static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker
 int _starpu_opencl_driver_init(struct _starpu_worker *args)
 {
 	int devid = args->devid;
+	int workerid = args->workerid;
 
 	_starpu_worker_start(args, _STARPU_FUT_OPENCL_KEY);
 
@@ -598,7 +599,7 @@ int _starpu_opencl_driver_init(struct _starpu_worker *args)
 
 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
-	_STARPU_TRACE_WORKER_INIT_END;
+	_STARPU_TRACE_WORKER_INIT_END(workerid);
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);

+ 1 - 1
src/drivers/scc/driver_scc_source.c

@@ -303,7 +303,7 @@ void *_starpu_scc_src_worker(void *arg)
 		snprintf(worker->name, sizeof(worker->name), "MIC %d core %u", devid, i);
 	}
 
-	_STARPU_TRACE_WORKER_INIT_END;
+	_STARPU_TRACE_WORKER_INIT_END(workerid);
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);

+ 8 - 0
tests/Makefile.am

@@ -220,6 +220,7 @@ noinst_PROGRAMS =				\
 	microbenchs/local_pingpong		\
 	microbenchs/matrix_as_vector		\
 	overlap/overlap				\
+	overlap/gpu_concurrency			\
 	parallel_tasks/explicit_combined_worker	\
 	parallel_tasks/parallel_kernels		\
 	parallel_tasks/parallel_kernels_spmd	\
@@ -609,6 +610,13 @@ datawizard_interfaces_void_void_interface_SOURCES=\
 	datawizard/interfaces/void/void_interface.c
 
 
+overlap_gpu_concurrency_SOURCES=\
+	overlap/gpu_concurrency.c
+if STARPU_USE_CUDA
+overlap_gpu_concurrency_SOURCES+=\
+	overlap/long_kernel.cu
+endif
+
 perfmodels_regression_based_SOURCES=\
 	perfmodels/regression_based.c
 

+ 87 - 0
tests/overlap/gpu_concurrency.c

@@ -0,0 +1,87 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <starpu.h>
+#include <stdlib.h>
+#include "../helper.h"
+#include <common/thread.h>
+
+#define NITERS 1000000
+#define NTASKS 128
+
+#ifdef STARPU_CUDA
+extern void long_kernel_cuda(unsigned long niters);
+void codelet_long_kernel(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	long_kernel_cuda(NITERS);
+}
+
+static struct starpu_perfmodel model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "long_kernel",
+};
+
+static struct starpu_codelet cl =
+{
+	.cuda_funcs = {codelet_long_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+	.nbuffers = 0,
+	.model =  &model
+};
+#endif
+
+int main(int argc, char **argv)
+{
+#ifndef STARPU_CUDA
+	return STARPU_TEST_SKIPPED;
+#else
+	int ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	if (starpu_cuda_worker_get_count() == 0)
+	{
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	unsigned iter;
+	for (iter = 0; iter < NTASKS; iter++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	starpu_shutdown();
+
+	STARPU_RETURN(EXIT_SUCCESS);
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	STARPU_RETURN(STARPU_TEST_SKIPPED);
+#endif
+}

+ 33 - 0
tests/overlap/long_kernel.cu

@@ -0,0 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+extern "C" __global__
+void long_kernel(unsigned long niters)
+{
+	unsigned long i;
+	for (i = 0; i < niters; i++)
+		__syncthreads();
+}
+
+extern "C"
+void long_kernel_cuda(unsigned long niters)
+{
+	dim3 dimBlock(1,1);
+	dim3 dimGrid(1,1);
+	long_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(niters);
+}