11 years ago · a4b84da340
--- a/ChangeLog
+++ b/ChangeLog
@@ -47,7 +47,7 @@ New features:
 
				   * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
			
 
				   * New functions starpu_pause() and starpu_resume()
			
 
				   * New codelet specific_nodes field to specify explicit target nodes for data.
			
 
				-  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
			
 
				+  * Use streams for all CUDA transfers, even initiated by CPUs.
			
 
				   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
			
 
				     CUDA and OpenCL kernel execution.
			
 
				   * Add paje traces statistics tools.
			
--- a/configure.ac
+++ b/configure.ac
@@ -853,6 +853,10 @@ if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
 
				 	  enable_opencl=$have_valid_opencl
			
 
				           ;;
			
 
				    esac
			
 
				+   save_LIBS="$LIBS"
			
 
				+   LIBS="$LIBS $STARPU_OPENCL_LDFLAGS"
			
 
				+   AC_CHECK_FUNCS([clEnqueueMarkerWithWaitList])
			
 
				+   LIBS="$save_LIBS"
			
 
				 fi
			
 
				 
			
 
				 AC_MSG_CHECKING(whether OpenCL should be used)
			
@@ -1365,6 +1369,7 @@ AC_MSG_RESULT($enable_debug)
 
				 AC_ARG_ENABLE(spinlock_check, [AS_HELP_STRING([--enable-spinlock-check], [enable spinlock check])], enable_spinlock_check=$enableval, enable_spinlock_check=no)
			
 
				 
			
 
				 if test x$enable_debug = xyes; then
			
 
				+	AC_DEFINE(STARPU_DEBUG, [1], [enable debugging statements])
			
 
				 	CFLAGS="$CFLAGS -O0"
			
 
				 	enable_spinlock_check=yes
			
 
				 else
			
--- a/doc/doxygen/chapters/05check_list_performance.doxy
+++ b/doc/doxygen/chapters/05check_list_performance.doxy
@@ -52,29 +52,37 @@ func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
 
				 cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 \endcode
			
 
				 
			
 
				+Unfortunately, some CUDA libraries do not have stream variants of
			
 
				+kernels. That will lower the potential for overlapping.
			
 
				+
			
 
				 StarPU already does appropriate calls for the CUBLAS library.
			
 
				 
			
 
				 If the kernel can be made to only use this local stream or other self-allocated
			
 
				 streams, i.e. the whole kernel submission can be made asynchronous, then
			
 
				-one should enable asynchronous execution of the kernel. This means setting
			
 
				-the corresponding cuda_flags[] flag in the codelet and dropping the
			
 
				-cudaStreamSynchronize() call at the end of the kernel. That way, StarPU will be
			
 
				-able to pipeline submitting tasks to GPUs, instead of synchronizing at each
			
 
				+one should enable asynchronous execution of the kernel.  That means setting
			
 
				+the STARPU_CUDA_ASYNC flag in cuda_flags[] in the codelet, and dropping the
			
 
				+cudaStreamSynchronize() call at the end of the cuda_func function, so that it
			
 
				+returns immediately after having queued the kernel to the local stream. That way, StarPU will be
			
 
				+able to submit and complete data transfers while kernels are executing, instead of only at each
			
 
				 kernel submission. The kernel just has to make sure that StarPU can use the
			
 
				 local stream to synchronize with the kernel startup and completion.
			
 
				 
			
 
				-Unfortunately, some CUDA libraries do not have stream variants of
			
 
				-kernels. That will lower the potential for overlapping.
			
 
				+Using the STARPU_CUDA_ASYNC flag also permits to enable concurrent kernel
			
 
				+execution, on cards which support it (Kepler and later, notably). This is
			
 
				+enabled by setting the STARPU_NWORKER_PER_CUDA environment variable to the
			
 
				+number of kernels to execute concurrently.  This is useful when kernels are
			
 
				+small and do not feed the whole GPU with threads to run.
			
 
				 
			
 
				 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
			
 
				 
			
 
				 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
			
 
				-streams, i.e. the whole kernel submission can be made asynchronous, then
			
 
				+queues, i.e. the whole kernel submission can be made asynchronous, then
			
 
				 one should enable asynchronous execution of the kernel. This means setting
			
 
				 the corresponding opencl_flags[] flag in the codelet and dropping the
			
 
				-clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel.
			
 
				-That way, StarPU will be able to pipeline submitting tasks to GPUs, instead of
			
 
				-synchronizing at each kernel submission. The kernel just has to make sure
			
 
				+clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel, so
			
 
				+that it returns immediately after having queued the kernel to the provided queue.
			
 
				+That way, StarPU will be able to submit and complete data transfers while kernels are executing, instead of
			
 
				+only at each kernel submission. The kernel just has to make sure
			
 
				 that StarPU can use the command queue it has provided to synchronize with the
			
 
				 kernel startup and completion.
			
 
				 
			
--- a/doc/doxygen/chapters/40environment_variables.doxy
+++ b/doc/doxygen/chapters/40environment_variables.doxy
@@ -43,6 +43,14 @@ environment variable \ref STARPU_WORKERS_CUDAID. By default, StarPU will
 
				 create as many CUDA workers as there are CUDA devices.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_NWORKER_PER_CUDA</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_NWORKER_PER_CUDA
			
 
				+\addindex __env__STARPU_NWORKER_PER_CUDA
			
 
				+Specify the number of workers per CUDA device, and thus the number of kernels
			
 
				+which will be concurrently running on the devices. The default value is 1.
			
 
				+</dd>
			
 
				+
			
 
				 <dt>STARPU_NOPENCL</dt>
			
 
				 <dd>
			
 
				 \anchor STARPU_NOPENCL
			
--- a/examples/pipeline/pipeline.c
+++ b/examples/pipeline/pipeline.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2012  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -101,6 +101,7 @@ void pipeline_cublas_axpy(void *descr[], void *arg)
 
				 	int n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				 	cublasSaxpy(n, 1., x, 1, y, 1);
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
@@ -142,6 +143,7 @@ void pipeline_cublas_sum(void *descr[], void *arg)
 
				 
			
 
				 	y = cublasSasum(n, x, 1);
			
 
				 
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 	FPRINTF(stderr,"CUBLAS finished with %f\n", y);
			
 
				 }
			
 
				 #endif
			
--- a/examples/sched_ctx/parallel_code.c
+++ b/examples/sched_ctx/parallel_code.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -36,17 +36,13 @@ int parallel_code(int sched_ctx)
 
				 	starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
			
 
				 
			
 
				 //	printf("execute task of %d threads \n", ncpuids);
			
 
				-	omp_set_nested(1);
			
 
				-#pragma omp parallel num_threads(1)
			
 
				-	{
			
 
				 #pragma omp parallel num_threads(ncpuids)
			
 
				-		{
			
 
				-			starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
			
 
				+	{
			
 
				+		starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
			
 
				 // 			printf("cpu = %d ctx%d nth = %d\n", sched_getcpu(), sched_ctx, omp_get_num_threads());
			
 
				 #pragma omp for
			
 
				-			for(i = 0; i < NTASKS; i++)
			
 
				-				t++;
			
 
				-		}
			
 
				+		for(i = 0; i < NTASKS; i++)
			
 
				+			t++;
			
 
				 	}
			
 
				 	free(cpuids);
			
 
				 	return t;
			
@@ -104,9 +100,8 @@ int main(int argc, char **argv)
 
				 #else
			
 
				 	procs1 = (int*)malloc(nprocs1*sizeof(int));
			
 
				 	procs2 = (int*)malloc(nprocs2*sizeof(int));
			
 
				-	procs1[0] = 0:
			
 
				-	procs2[0] = 0:
			
 
				-
			
 
				+	procs1[0] = 0;
			
 
				+	procs2[0] = 0;
			
 
				 #endif
			
 
				 
			
 
				 	int p;
			
--- a/include/starpu_tree.h
+++ b/include/starpu_tree.h
@@ -41,7 +41,7 @@ struct starpu_tree *starpu_tree_get(struct starpu_tree *tree, int id);
 
				 
			
 
				 struct starpu_tree *starpu_tree_get_neighbour(struct starpu_tree *tree, struct starpu_tree *node, int *visited, int *present);
			
 
				 
			
 
				-int starpu_tree_free(struct starpu_tree *tree);
			
 
				+void starpu_tree_free(struct starpu_tree *tree);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -1646,7 +1646,7 @@ void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, uns
 
				 {
			
 
				 	int *workerids;
			
 
				 	int nworkers = starpu_sched_ctx_get_workers_list(sched_ctx_id, &workerids);
			
 
				-	_starpu_sched_ctx_get_workers_to_sleep(sched_ctx_id, workerids, nworkers, -1);
			
 
				+	_starpu_sched_ctx_get_workers_to_sleep(sched_ctx_id, workerids, nworkers, workerids[nworkers-1]);
			
 
				 
			
 
				 	/* bind current thread on all workers of the context */
			
 
				 	_starpu_sched_ctx_bind_thread_to_ctx_cpus(sched_ctx_id);
			
@@ -1655,7 +1655,7 @@ void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, uns
 
				 	void* ret = func(param);
			
 
				 
			
 
				 	/* wake up starpu workers */
			
 
				-	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, -1);
			
 
				+	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, workerids[nworkers-1]);
			
 
				 
			
 
				 	return ret;
			
 
				 }
			
--- a/src/core/tree.c
+++ b/src/core/tree.c
@@ -138,19 +138,16 @@ struct starpu_tree* starpu_tree_get_neighbour(struct starpu_tree *tree, struct s
 
				 	return starpu_tree_get_neighbour(tree, father, visited, present);
			
 
				 }
			
 
				 
			
 
				-int starpu_tree_free(struct starpu_tree *tree)
			
 
				+void starpu_tree_free(struct starpu_tree *tree)
			
 
				 {
			
 
				-	if(tree->arity == 0)
			
 
				-		return 1;
			
 
				 	int i;
			
 
				 	for(i = 0; i < tree->arity; i++)
			
 
				 	{
			
 
				-		if(starpu_tree_free(tree->nodes[i]))
			
 
				-		{
			
 
				-			free(tree->nodes);
			
 
				-			tree->arity = 0;
			
 
				-			return 1;
			
 
				-		}
			
 
				+		starpu_tree_free(tree->nodes[i]);
			
 
				+		free(tree->nodes[i]);
			
 
				+		tree->nodes[i] = NULL;
			
 
				 	}
			
 
				-	return 0;
			
 
				+	free(tree->nodes);
			
 
				+	tree->nodes = NULL;
			
 
				+	tree->arity = 0;
			
 
				 }
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -900,11 +900,7 @@ static void _starpu_build_tree(void)
 
				 	struct starpu_tree* tree = (struct starpu_tree*)malloc(sizeof(struct starpu_tree));
			
 
				 	config.topology.tree = tree;
			
 
				 
			
 
				-	hwloc_topology_t topology;
			
 
				-	hwloc_topology_init(&topology);
			
 
				-	hwloc_topology_load(topology);
			
 
				-
			
 
				-	hwloc_obj_t root = hwloc_get_root_obj(topology);
			
 
				+	hwloc_obj_t root = hwloc_get_root_obj(config.topology.hwtopology);
			
 
				 
			
 
				 /* 	char string[128]; */
			
 
				 /* 	hwloc_obj_snprintf(string, sizeof(string), topology, root, "#", 0); */
			
@@ -912,7 +908,7 @@ static void _starpu_build_tree(void)
 
				 
			
 
				 	/* level, is_pu, is in the tree (it will be true only after add*/
			
 
				 	starpu_tree_insert(tree, root->logical_index, 0,root->type == HWLOC_OBJ_PU, root->arity, NULL);
			
 
				-	_fill_tree(tree, root, 1, topology);
			
 
				+	_fill_tree(tree, root, 1, config.topology.hwtopology);
			
 
				 #endif
			
 
				 }
			
 
				 
			
@@ -1320,7 +1316,7 @@ void starpu_shutdown(void)
 
				 	_starpu_delete_all_sched_ctxs();
			
 
				 
			
 
				 	_starpu_disk_unregister();
			
 
				-
			
 
				+	starpu_tree_free(config.topology.tree);
			
 
				 	_starpu_destroy_topology(&config);
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 	_starpu_stop_fxt_profiling();
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -194,13 +194,7 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 
				 			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
			
 
				 			/* GPUs not always allow direct remote access: if CUDA4
			
 
				 			 * is enabled, we allow two CUDA devices to communicate. */
			
 
				-			return
			
 
				-#if 0
			
 
				-				/* CUDA does not seem very safe with concurrent
			
 
				-				 * transfer queueing, avoid queueing from CPUs */
			
 
				-				kind == STARPU_CPU_RAM ||
			
 
				-#endif
			
 
				-				kind == STARPU_CUDA_RAM;
			
 
				+			return kind == STARPU_CPU_RAM || kind == STARPU_CUDA_RAM;
			
 
				 		}
			
 
				 #else
			
 
				 			/* Direct GPU-GPU transfers are not allowed in general */
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -163,7 +163,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
			
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				-			stream = starpu_cuda_get_local_out_transfer_stream();
			
 
				+			stream = starpu_cuda_get_out_transfer_stream(src_node);
			
 
				 			if (copy_methods->cuda_to_ram_async)
			
 
				 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				 			else
			
@@ -199,7 +199,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess))
			
 
				 				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				-			stream = starpu_cuda_get_local_in_transfer_stream();
			
 
				+			stream = starpu_cuda_get_in_transfer_stream(dst_node);
			
 
				 			if (copy_methods->ram_to_cuda_async)
			
 
				 				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				 			else
			
@@ -531,7 +531,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 
				 				(void*) src + src_offset, src_node,
			
 
				 				(void*) dst + dst_offset, dst_node,
			
 
				 				size,
			
 
				-				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
			
 
				+				async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
			
 
				 				cudaMemcpyDeviceToHost);
			
 
				 
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
			
@@ -539,7 +539,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 
				 				(void*) src + src_offset, src_node,
			
 
				 				(void*) dst + dst_offset, dst_node,
			
 
				 				size,
			
 
				-				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
			
 
				+				async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
			
 
				 				cudaMemcpyHostToDevice);
			
 
				 
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
			
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2014  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -259,7 +259,7 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 
				 	if (mc->automatically_allocated &&
			
 
				 		(!handle || replicate->refcnt == 0))
			
 
				 	{
			
 
				-		void *interface;
			
 
				+		void *data_interface;
			
 
				 
			
 
				 		if (handle)
			
 
				 			STARPU_ASSERT(replicate->allocated);
			
@@ -276,13 +276,13 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 
				 #endif
			
 
				 
			
 
				 		if (handle)
			
 
				-			interface = replicate->data_interface;
			
 
				+			data_interface = replicate->data_interface;
			
 
				 		else
			
 
				-			interface = mc->chunk_interface;
			
 
				-		STARPU_ASSERT(interface);
			
 
				+			data_interface = mc->chunk_interface;
			
 
				+		STARPU_ASSERT(data_interface);
			
 
				 
			
 
				 		_STARPU_TRACE_START_FREE(node, mc->size);
			
 
				-		mc->ops->free_data_on_node(interface, node);
			
 
				+		mc->ops->free_data_on_node(data_interface, node);
			
 
				 		_STARPU_TRACE_END_FREE(node);
			
 
				 
			
 
				 		if (handle)
			
@@ -417,7 +417,7 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 
				  * therefore not in the cache. */
			
 
				 static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_replicate, struct _starpu_mem_chunk *mc, unsigned is_already_in_mc_list)
			
 
				 {
			
 
				-	void *interface;
			
 
				+	void *data_interface;
			
 
				 
			
 
				 	/* we found an appropriate mem chunk: so we get it out
			
 
				 	 * of the "to free" list, and reassign it to the new
			
@@ -429,18 +429,18 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
				 		old_replicate->allocated = 0;
			
 
				 		old_replicate->automatically_allocated = 0;
			
 
				 		old_replicate->initialized = 0;
			
 
				-		interface = old_replicate->data_interface;
			
 
				+		data_interface = old_replicate->data_interface;
			
 
				 	}
			
 
				 	else
			
 
				-		interface = mc->chunk_interface;
			
 
				+		data_interface = mc->chunk_interface;
			
 
				 
			
 
				 	new_replicate->allocated = 1;
			
 
				 	new_replicate->automatically_allocated = 1;
			
 
				 	new_replicate->initialized = 0;
			
 
				 
			
 
				 	STARPU_ASSERT(new_replicate->data_interface);
			
 
				-	STARPU_ASSERT(interface);
			
 
				-	memcpy(new_replicate->data_interface, interface, mc->size_interface);
			
 
				+	STARPU_ASSERT(data_interface);
			
 
				+	memcpy(new_replicate->data_interface, data_interface, mc->size_interface);
			
 
				 
			
 
				 	if (!old_replicate)
			
 
				 	{
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -42,8 +42,8 @@ static unsigned ncudagpus;
 
				 static size_t global_mem[STARPU_MAXCUDADEVS];
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static cudaStream_t streams[STARPU_NMAXWORKERS];
			
 
				-static cudaStream_t out_transfer_streams[STARPU_NMAXWORKERS];
			
 
				-static cudaStream_t in_transfer_streams[STARPU_NMAXWORKERS];
			
 
				+static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
			
 
				+static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
			
 
				 static cudaStream_t peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
			
 
				 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
			
 
				 static cudaEvent_t task_events[STARPU_NMAXWORKERS];
			
@@ -116,18 +116,18 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-cudaStream_t starpu_cuda_get_local_in_transfer_stream(void)
			
 
				+cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node)
			
 
				 {
			
 
				-	int worker = starpu_worker_get_id();
			
 
				+	int devid = _starpu_memory_node_get_devid(node);
			
 
				 
			
 
				-	return in_transfer_streams[worker];
			
 
				+	return in_transfer_streams[devid];
			
 
				 }
			
 
				 
			
 
				-cudaStream_t starpu_cuda_get_local_out_transfer_stream(void)
			
 
				+cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node)
			
 
				 {
			
 
				-	int worker = starpu_worker_get_id();
			
 
				+	int devid = _starpu_memory_node_get_devid(node);
			
 
				 
			
 
				-	return out_transfer_streams[worker];
			
 
				+	return out_transfer_streams[devid];
			
 
				 }
			
 
				 
			
 
				 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
			
@@ -263,11 +263,11 @@ static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				-		cures = cudaStreamCreate(&in_transfer_streams[workerid]);
			
 
				+		cures = cudaStreamCreate(&in_transfer_streams[devid]);
			
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				-		cures = cudaStreamCreate(&out_transfer_streams[workerid]);
			
 
				+		cures = cudaStreamCreate(&out_transfer_streams[devid]);
			
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	}
			
@@ -293,8 +293,8 @@ static void deinit_context(struct _starpu_worker_set *worker_set)
 
				 
			
 
				 		cudaEventDestroy(task_events[workerid]);
			
 
				 		cudaStreamDestroy(streams[workerid]);
			
 
				-		cudaStreamDestroy(in_transfer_streams[workerid]);
			
 
				-		cudaStreamDestroy(out_transfer_streams[workerid]);
			
 
				+		cudaStreamDestroy(in_transfer_streams[devid]);
			
 
				+		cudaStreamDestroy(out_transfer_streams[devid]);
			
 
				 	}
			
 
				 
			
 
				 	for (i = 0; i < ncudagpus; i++)
			
@@ -588,6 +588,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
				 #endif
			
 
				 		/* Synchronous execution */
			
 
				 		{
			
 
				+#if defined(STARPU_DEBUG) && !defined(STARPU_SIMGRID)
			
 
				+			STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
			
 
				+#endif
			
 
				 			finish_job_on_cuda(j, args);
			
 
				 		}
			
 
				 		_STARPU_TRACE_START_PROGRESS(memnode);
			
--- a/src/drivers/cuda/driver_cuda.h
+++ b/src/drivers/cuda/driver_cuda.h
@@ -48,8 +48,8 @@ void *_starpu_cuda_worker(void *);
 
				 #  define _starpu_cuda_discover_devices(config) ((void) config)
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
			
 
				-cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
			
 
				+cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node);
			
 
				+cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node);
			
 
				 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
			
 
				 
			
 
				 struct _starpu_worker_set;
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -685,11 +685,16 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 
				 		int err;
			
 
				 		cl_command_queue queue;
			
 
				 		starpu_opencl_get_queue(args->devid, &queue);
			
 
				-#ifdef CL_VERSION_1_2
			
 
				-		err = clEnqueueMarkerWithWaitList(queue, 0, NULL, &task_events[args->devid]);
			
 
				-#else
			
 
				+		/* the function clEnqueueMarker is deprecated from
			
 
				+		 * OpenCL version 1.2. We would like to use the new
			
 
				+		 * function clEnqueueMarkerWithWaitList. We could do
			
 
				+		 * it by checking its availability through our own
			
 
				+		 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
			
 
				+		 * and the OpenCL macro CL_VERSION_1_2. However these
			
 
				+		 * 2 macros detect the function availability in the
			
 
				+		 * ICD and not in the device implementation.
			
 
				+		 */
			
 
				 		err = clEnqueueMarker(queue, &task_events[args->devid]);
			
 
				-#endif
			
 
				 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 	}
			
 
				 	else