14 年前 · eb10f6b655
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
@@ -798,9 +798,9 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
				     unsigned threads_per_block = 64;
			
 
				     unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				-@i{    vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);}
			
 
				+@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);}
			
 
				 
			
 
				-@i{    cudaThreadSynchronize();}
			
 
				+@i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
			
 
				 @}
			
 
				 @end smallexample
			
 
				 @end cartouche
			
@@ -1278,6 +1278,7 @@ TODO: improve!
 
				 * Task distribution vs Data transfer::
			
 
				 * Power-based scheduling::
			
 
				 * Profiling::
			
 
				+* CUDA-specific optimizations::
			
 
				 @end menu
			
 
				 
			
 
				 Simply encapsulating application kernels into tasks already permits to
			
@@ -1384,6 +1385,21 @@ execution did happen on accelerators without penalizing performance with
 
				 the profiling overhead. More details on performance feedback are provided by the
			
 
				 next chapter.
			
 
				 
			
 
				+@node CUDA-specific optimizations
			
 
				+@section CUDA-specific optimizations
			
 
				+
			
 
				+Due to CUDA limitations, StarPU will have a hard time overlapping
			
 
				+communications and computations if the application does not use a dedicated
			
 
				+CUDA stream for its computations. StarPU provides one through
			
 
				+@code{starpu_cuda_get_local_stream()}. For instance:
			
 
				+
			
 
				+@example
			
 
				+func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
			
 
				+cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+@end example
			
 
				+
			
 
				+Unfortunately, many CUDA libraries do not provide stream variants of their kernels.
			
 
				+
			
 
				 @c ---------------------------------------------------------------------
			
 
				 @c Performance feedback
			
 
				 @c ---------------------------------------------------------------------
			
--- a/doc/vector_scal_cuda.texi
+++ b/doc/vector_scal_cuda.texi
@@ -19,7 +19,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
				         unsigned threads_per_block = 64;
			
 
				         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				-        vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);
			
 
				+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
			
 
				 
			
 
				-        cudaThreadSynchronize();
			
 
				+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 @}
			
--- a/examples/basic_examples/block_cuda.cu
+++ b/examples/basic_examples/block_cuda.cu
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 static __global__ void cuda_block(float *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float multiplier)
			
 
				 {
			
@@ -37,6 +38,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 
				         unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
			
 
				         float *multiplier = (float *)_args;
			
 
				 
			
 
				-        cuda_block<<<1,1>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
			
 
				-	cudaThreadSynchronize();
			
 
				+        cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/basic_examples/variable_kernels.cu
+++ b/examples/basic_examples/variable_kernels.cu
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 static __global__ void cuda_variable(float * tab)
			
 
				 {
			
@@ -26,5 +27,6 @@ extern "C" void cuda_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 
				 {
			
 
				 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				-	cuda_variable<<<1,1>>>(val);
			
 
				+	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/basic_examples/vector_scal_cuda.cu
+++ b/examples/basic_examples/vector_scal_cuda.cu
@@ -19,6 +19,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 static __global__ void vector_mult_cuda(float *val, unsigned n,
			
 
				                                         float factor)
			
@@ -40,7 +41,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
				 	unsigned threads_per_block = 64;
			
 
				 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				-        vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);
			
 
				+        vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n, *factor);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/filters/fblock_cuda.cu
+++ b/examples/filters/fblock_cuda.cu
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 static __global__ void fblock_cuda(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float factor)
			
 
				 {
			
@@ -38,7 +39,7 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
				         unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
			
 
				 
			
 
				         /* TODO: use more blocks and threads in blocks */
			
 
				-        fblock_cuda<<<1,1>>>(block, nx, ny, nz, ldy, ldz, *factor);
			
 
				+        fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/incrementer/incrementer_kernels.cu
+++ b/examples/incrementer/incrementer_kernels.cu
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 static __global__ void cuda_incrementer(float * tab)
			
 
				 {
			
@@ -29,5 +30,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 
				 	(void)_args;
			
 
				 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				-	cuda_incrementer<<<1,1>>>(val);
			
 
				+	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/pi/SobolQRNG/sobol_gpu.cu
+++ b/examples/pi/SobolQRNG/sobol_gpu.cu
@@ -39,6 +39,8 @@
 
				 
			
 
				 #include "sobol.h"
			
 
				 #include "sobol_gpu.h"
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 #define k_2powneg32 2.3283064E-10F
			
 
				 
			
@@ -164,5 +166,5 @@ void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float
 
				     dimBlock.x = threadsperblock;
			
 
				 
			
 
				     // Execute GPU kernel
			
 
				-    sobolGPU_kernel<<<dimGrid, dimBlock>>>(n_vectors, n_dimensions, d_directions, d_output);
			
 
				+    sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
			
 
				 }
			
--- a/examples/pi/pi_kernel.cu
+++ b/examples/pi/pi_kernel.cu
@@ -16,6 +16,7 @@
 
				 
			
 
				 #include "SobolQRNG/sobol_gpu.h"
			
 
				 #include "pi.h"
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 #define MAXNBLOCKS	128
			
 
				 #define MAXTHREADSPERBLOCK	256
			
@@ -109,7 +110,7 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 
				 	STARPU_ASSERT(random_numbers);
			
 
				 	
			
 
				 	sobolGPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	TYPE *random_numbers_x = &random_numbers[0];
			
 
				 	TYPE *random_numbers_y = &random_numbers[nx];
			
@@ -132,14 +133,14 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 
				 
			
 
				 	/* each entry of per_block_cnt contains the number of successful shots
			
 
				 	 * in the corresponding block. */
			
 
				-	monte_carlo<<<nblocks, nthread_per_block>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
			
 
				+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
			
 
				 
			
 
				 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
			
 
				 
			
 
				 	/* compute the total number of successful shots by adding the elements
			
 
				 	 * of the per_block_cnt array */
			
 
				-	sum_per_block_cnt<<<1, nblocks>>>(per_block_cnt, cnt);
			
 
				-	cures = cudaThreadSynchronize();
			
 
				+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
			
 
				+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 	if (cures)
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
--- a/examples/pi/pi_redux_kernel.cu
+++ b/examples/pi/pi_redux_kernel.cu
@@ -113,14 +113,14 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 
				 
			
 
				 	/* each entry of per_block_cnt contains the number of successful shots
			
 
				 	 * in the corresponding block. */
			
 
				-	monte_carlo<<<nblocks, nthread_per_block>>>(x, y, n, per_block_cnt);
			
 
				+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
			
 
				 
			
 
				 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
			
 
				 
			
 
				 	/* compute the total number of successful shots by adding the elements
			
 
				 	 * of the per_block_cnt array */
			
 
				-	sum_per_block_cnt<<<1, nblocks>>>(per_block_cnt, shot_cnt);
			
 
				-	cures = cudaThreadSynchronize();
			
 
				+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
			
 
				+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 	if (cures)
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
--- a/examples/spmv/spmv_cuda.cu
+++ b/examples/spmv/spmv_cuda.cu
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 #define MIN(a,b)	((a)<(b)?(a):(b))
			
 
				 
			
@@ -95,10 +96,10 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
				 	dim3 dimBlock(8, 1);
			
 
				 	dim3 dimGrid(512, 1);
			
 
				 
			
 
				-	spmv_kernel_3<<<dimGrid, dimBlock>>>(nnz, nrow, nzval, colind, rowptr,
			
 
				-						firstentry, vecin, nx_in, vecout, nx_out);
			
 
				+	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
			
 
				+		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 }
			
 
				 
			
--- a/examples/starpufft/cudax_kernels.cu
+++ b/examples/starpufft/cudax_kernels.cu
@@ -28,13 +28,13 @@
 
				 \
			
 
				 	if (n < threads_per_block) { \
			
 
				 		dim3 dimGrid(n); \
			
 
				-		func <<<dimGrid, 1>>> args; \
			
 
				+		func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				 	} else { \
			
 
				 		dim3 dimGrid(n / threads_per_block); \
			
 
				 		dim3 dimBlock(threads_per_block); \
			
 
				-		func <<<dimGrid, dimBlock>>> args; \
			
 
				+		func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				 	} \
			
 
				-	cudaThreadSynchronize(); \
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
			
 
				 
			
 
				 extern "C" __global__ void
			
 
				 STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
			
@@ -83,24 +83,24 @@ STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsign
 
				 	if (n < threads_per_dim) { \
			
 
				 		if (m < threads_per_dim) { \
			
 
				 			dim3 dimGrid(n, m); \
			
 
				-			func <<<dimGrid, 1>>> args; \
			
 
				+			func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				 		} else { \
			
 
				 			dim3 dimGrid(1, m / threads_per_dim); \
			
 
				 			dim3 dimBlock(n, threads_per_dim); \
			
 
				-			func <<<dimGrid, dimBlock>>> args; \
			
 
				+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				 		} \
			
 
				 	} else {  \
			
 
				 		if (m < threads_per_dim) { \
			
 
				 			dim3 dimGrid(n / threads_per_dim, 1); \
			
 
				 			dim3 dimBlock(threads_per_dim, m); \
			
 
				-			func <<<dimGrid, dimBlock>>> args; \
			
 
				+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				 		} else { \
			
 
				 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
			
 
				 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
			
 
				-			func <<<dimGrid, dimBlock>>> args; \
			
 
				+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				 		} \
			
 
				 	} \
			
 
				-	cudaThreadSynchronize(); \
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
			
 
				 
			
 
				 extern "C" __global__ void
			
 
				 STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
			
--- a/examples/starpufft/cudax_kernels.h
+++ b/examples/starpufft/cudax_kernels.h
@@ -16,6 +16,7 @@
 
				  */
			
 
				 
			
 
				 #include <cuComplex.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 _externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
			
 
				 _externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
			
 
				 _externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);
			
--- a/examples/starpufft/starpufftx1d.c
+++ b/examples/starpufft/starpufftx1d.c
@@ -72,7 +72,7 @@ STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 
				 	
			
 
				 	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 /* fft1:
			
@@ -97,6 +97,8 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
				 
			
 
				 	if (!plan->plans[workerid].initialized1) {
			
 
				 		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
			
 
				+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				+		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
			
 
				 
			
 
				 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 		plan->plans[workerid].initialized1 = 1;
			
@@ -107,7 +109,7 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
				 
			
 
				 	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 /* fft2:
			
@@ -132,6 +134,8 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
				 
			
 
				 	if (!plan->plans[workerid].initialized2) {
			
 
				 		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
			
 
				+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				+		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
			
 
				 
			
 
				 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 		plan->plans[workerid].initialized2 = 1;
			
@@ -141,7 +145,7 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
				 	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
			
 
				 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/examples/starpufft/starpufftx2d.c
+++ b/examples/starpufft/starpufftx2d.c
@@ -41,7 +41,7 @@ STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
 
				 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				 
			
 
				 	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 /* Perform an n2,m2 fft */
			
@@ -67,6 +67,8 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
				 
			
 
				 	if (!plan->plans[workerid].initialized1) {
			
 
				 		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
			
 
				+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				+		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
			
 
				 
			
 
				 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 		plan->plans[workerid].initialized1 = 1;
			
@@ -78,7 +80,7 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
				 	/* synchronization is done after the twiddling */
			
 
				 	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 static void
			
@@ -104,6 +106,8 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
				 
			
 
				 	if (!plan->plans[workerid].initialized2) {
			
 
				 		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
			
 
				+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				+		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
			
 
				 
			
 
				 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 		plan->plans[workerid].initialized2 = 1;
			
@@ -114,7 +118,7 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
				 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
			
 
				 	}
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/examples/stencil/life_cuda.cu
+++ b/examples/stencil/life_cuda.cu
@@ -16,6 +16,7 @@
 
				 
			
 
				 #define _externC extern "C"
			
 
				 #include "stencil.h"
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 /* Heart of the stencil computation: compute a new state from an old one. */
			
 
				 
			
@@ -72,5 +73,5 @@ cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int n
 
				 	dim3 dimBlock(threads_per_dim_x, threads_per_dim_y);
			
 
				 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
			
 
				 #endif
			
 
				-	cuda_life_update <<<dimGrid, dimBlock>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
			
 
				+	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
			
 
				 }
			
--- a/examples/stencil/shadow.cu
+++ b/examples/stencil/shadow.cu
@@ -16,6 +16,7 @@
 
				 
			
 
				 #define _externC extern "C"
			
 
				 #include "stencil.h"
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 /* Perform replication of data on X and Y edges, to fold the domain on 
			
 
				    itself through mere replication of the source state. */
			
@@ -54,5 +55,5 @@ cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, in
 
				 	dim3 dimBlock(threads_per_dim_x, threads_per_dim_y);
			
 
				 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
			
 
				 #endif
			
 
				-	cuda_shadow <<<dimGrid, dimBlock>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
			
 
				+	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
			
 
				 }
			
--- a/examples/stencil/stencil-kernels.c
+++ b/examples/stencil/stencil-kernels.c
@@ -173,7 +173,7 @@ static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
 
				 	unsigned offset = firstz*block->ldz;
			
 
				 	TYPE *block_data = (TYPE *)block->ptr;
			
 
				 	TYPE *boundary_data = (TYPE *)boundary->ptr;
			
 
				-	cudaMemcpy(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice);
			
 
				+	cudaMemcpyAsync(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -243,12 +243,12 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
				 #ifdef LIFE
			
 
				 		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
			
 
				 #else
			
 
				-		cudaMemcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice);
			
 
				+		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
			
 
				 #endif /* LIFE */
			
 
				 	}
			
 
				 
			
 
				 	cudaError_t cures;
			
 
				-	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
			
 
				+	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 }
			
@@ -407,7 +407,7 @@ static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
 
				 	unsigned offset = firstz*block->ldz;
			
 
				 	TYPE *block_data = (TYPE *)block->ptr;
			
 
				 	TYPE *boundary_data = (TYPE *)boundary->ptr;
			
 
				-	cudaMemcpy(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice);
			
 
				+	cudaMemcpyAsync(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 
			
@@ -459,7 +459,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 
				 
			
 
				 	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
			
 
				 	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 /* bottom save, CUDA version */
			
@@ -473,7 +473,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 
				 
			
 
				 	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
			
 
				 	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 
			
--- a/include/starpu_cuda.h
+++ b/include/starpu_cuda.h
@@ -79,7 +79,7 @@ extern "C" {
 
				 		STARPU_CUBLAS_OOPS();					\
			
 
				 	} while (0)
			
 
				 
			
 
				-cudaStream_t *starpu_cuda_get_local_stream(void);
			
 
				+cudaStream_t starpu_cuda_get_local_stream(void);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -66,9 +66,9 @@ struct starpu_data_copy_methods {
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	/* for asynchronous CUDA transfers */
			
 
				-	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
			
 
				-	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
			
 
				-	int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
			
 
				+	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				+	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				+	int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
--- a/mpi/tests/ring_kernel.cu
+++ b/mpi/tests/ring_kernel.cu
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 static __global__ void cuda_incrementer(unsigned *token)
			
 
				 {
			
@@ -26,5 +27,6 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 
				 	(void) _args;
			
 
				 	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				-	cuda_incrementer<<<1,1>>>(tokenptr);
			
 
				+	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -102,7 +102,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	cudaError_t cures;
			
 
				-	cudaStream_t *stream;
			
 
				+	cudaStream_t stream;
			
 
				 #endif
			
 
				 
			
 
				 	void *src_interface = src_replicate->interface;
			
@@ -132,7 +132,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
				 				stream = starpu_cuda_get_local_stream();
			
 
				 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				 
			
 
				-				cures = cudaEventRecord(req->async_channel.cuda_event, *stream);
			
 
				+				cures = cudaEventRecord(req->async_channel.cuda_event, stream);
			
 
				 				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 			}
			
 
				 		}
			
@@ -157,7 +157,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
				 			stream = starpu_cuda_get_local_stream();
			
 
				 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				 
			
 
				-			cures = cudaEventRecord(req->async_channel.cuda_event, *stream);
			
 
				+			cures = cudaEventRecord(req->async_channel.cuda_event, stream);
			
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 		}
			
 
				 		break;
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -31,8 +31,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
@@ -435,7 +435,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream, enum cudaMemcpyKind kind)
			
 
				+static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_block_interface_t *src_block = src_interface;
			
 
				 	starpu_block_interface_t *dst_block = dst_interface;
			
@@ -457,7 +457,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
				 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
			
 
				 		{
			
 
				 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
			
 
				-					nx*ny*nz*elemsize, kind, *stream);
			
 
				+					nx*ny*nz*elemsize, kind, stream);
			
 
				 			if (STARPU_UNLIKELY(cures))
			
 
				 			{
			
 
				 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
			
@@ -476,7 +476,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
				 			/* Are all plans contiguous */
			
 
				 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
			
 
				 					(char *)src_block->ptr, src_block->ldz*elemsize,
			
 
				-					nx*ny*elemsize, nz, kind, *stream);
			
 
				+					nx*ny*elemsize, nz, kind, stream);
			
 
				 			if (STARPU_UNLIKELY(cures))
			
 
				 			{
			
 
				 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
			
@@ -502,7 +502,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
				 
			
 
				 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
			
 
				                                                   (char *)src_ptr, src_block->ldy*elemsize,
			
 
				-                                                  nx*elemsize, ny, kind, *stream);
			
 
				+                                                  nx*elemsize, ny, kind, stream);
			
 
				 
			
 
				 			if (STARPU_UNLIKELY(cures))
			
 
				 			{
			
@@ -557,12 +557,12 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
			
 
				 }
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -30,8 +30,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 
				 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
@@ -385,7 +385,7 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				 {
			
 
				 	starpu_matrix_interface_t *src_matrix = src_interface;
			
 
				 	starpu_matrix_interface_t *dst_matrix = dst_interface;
			
@@ -396,7 +396,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 
				 	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				 			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
			
 
				 			(size_t)src_matrix->nx*elemsize, src_matrix->ny,
			
 
				-			cudaMemcpyDeviceToHost, *stream);
			
 
				+			cudaMemcpyDeviceToHost, stream);
			
 
				 	if (cures)
			
 
				 	{
			
 
				 		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
@@ -415,7 +415,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 
				 	return -EAGAIN;
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				 {
			
 
				 	starpu_matrix_interface_t *src_matrix = src_interface;
			
 
				 	starpu_matrix_interface_t *dst_matrix = dst_interface;
			
@@ -426,7 +426,7 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 
				 	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				 				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
			
 
				 				src_matrix->nx*elemsize, src_matrix->ny,
			
 
				-				cudaMemcpyHostToDevice, *stream);
			
 
				+				cudaMemcpyHostToDevice, stream);
			
 
				 	if (cures)
			
 
				 	{
			
 
				 		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -29,8 +29,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_int
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
@@ -305,13 +305,13 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
				 
			
 
				 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				 					void *dst_interface, unsigned dst_node __attribute__((unused)),
			
 
				-					cudaStream_t *stream, enum cudaMemcpyKind kind)
			
 
				+					cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_variable_interface_t *src_variable = src_interface;
			
 
				 	starpu_variable_interface_t *dst_variable = dst_interface;
			
 
				 
			
 
				 	cudaError_t cures;
			
 
				-	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, *stream);
			
 
				+	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, stream);
			
 
				 	if (cures)
			
 
				 	{
			
 
				 		/* do it in a synchronous fashion */
			
@@ -330,13 +330,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
				 
			
 
				 
			
 
				 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
			
 
				+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
			
 
				+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
			
 
				 }
			
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -29,8 +29,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
			
 
				 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t *stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t *stream);
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
@@ -340,13 +340,13 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
				 
			
 
				 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				 					void *dst_interface, unsigned dst_node __attribute__((unused)),
			
 
				-					cudaStream_t *stream, enum cudaMemcpyKind kind)
			
 
				+					cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_vector_interface_t *src_vector = src_interface;
			
 
				 	starpu_vector_interface_t *dst_vector = dst_interface;
			
 
				 
			
 
				 	cudaError_t cures;
			
 
				-	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, *stream);
			
 
				+	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
			
 
				 	if (cures)
			
 
				 	{
			
 
				 		/* do it in a synchronous fashion */
			
@@ -364,13 +364,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
				 
			
 
				 
			
 
				 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
			
 
				+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
			
 
				+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
			
 
				 }
			
--- a/src/datawizard/interfaces/void_interface.c
+++ b/src/datawizard/interfaces/void_interface.c
@@ -26,7 +26,7 @@
 
				 
			
 
				 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
			
 
				+static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event);
			
@@ -141,7 +141,7 @@ static int dummy_cuda_copy_async(void *src_interface __attribute__((unused)),
 
				 				unsigned src_node __attribute__((unused)),
			
 
				 				void *dst_interface __attribute__((unused)),
			
 
				 				unsigned dst_node __attribute__((unused)),
			
 
				-				cudaStream_t *stream __attribute__ ((unused)))
			
 
				+				cudaStream_t stream __attribute__ ((unused)))
			
 
				 {
			
 
				 	return 0;
			
 
				 }
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -83,16 +83,17 @@ static void unlimit_gpu_mem_if_needed(int devid)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-cudaStream_t *starpu_cuda_get_local_stream(void)
			
 
				+cudaStream_t starpu_cuda_get_local_stream(void)
			
 
				 {
			
 
				 	int worker = starpu_worker_get_id();
			
 
				 
			
 
				-	return &streams[worker];
			
 
				+	return streams[worker];
			
 
				 }
			
 
				 
			
 
				 static void init_context(int devid)
			
 
				 {
			
 
				 	cudaError_t cures;
			
 
				+	int workerid = starpu_worker_get_id();
			
 
				 
			
 
				 	cures = cudaSetDevice(devid);
			
 
				 	if (STARPU_UNLIKELY(cures))
			
@@ -103,7 +104,7 @@ static void init_context(int devid)
 
				 
			
 
				 	limit_gpu_mem_if_needed(devid);
			
 
				 
			
 
				-	cures = cudaStreamCreate(starpu_cuda_get_local_stream());
			
 
				+	cures = cudaStreamCreate(&streams[workerid]);
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 }
			
--- a/tests/datawizard/acquire_release_cuda.cu
+++ b/tests/datawizard/acquire_release_cuda.cu
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 static __global__ void _increment_cuda_codelet(unsigned *val)
			
 
				 {
			
@@ -25,7 +26,7 @@ extern "C" void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_a
 
				 {
			
 
				 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				-	_increment_cuda_codelet<<<1,1>>>(val);
			
 
				+	_increment_cuda_codelet<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/tests/datawizard/cuda_codelet_unsigned_inc.cu
+++ b/tests/datawizard/cuda_codelet_unsigned_inc.cu
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 static __global__ void _cuda_unsigned_inc(unsigned *val)
			
 
				 {
			
@@ -25,7 +26,7 @@ extern "C" void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED
 
				 {
			
 
				 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				-	_cuda_unsigned_inc<<<1,1>>>(val);
			
 
				+	_cuda_unsigned_inc<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/tests/datawizard/data_invalidation.c
+++ b/tests/datawizard/data_invalidation.c
@@ -18,6 +18,7 @@
 
				 #include <unistd.h>
			
 
				 #include <errno.h>
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 #include <stdlib.h>
			
 
				 
			
 
				 #define NLOOPS		1000
			
@@ -35,8 +36,8 @@ static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_a
 
				 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				-	cudaMemset(buf, 42, length);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemsetAsync(buf, 42, length, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/tests/datawizard/scratch_cuda.cu
+++ b/tests/datawizard/scratch_cuda.cu
@@ -16,6 +16,7 @@
 
				 
			
 
				 #include <stdio.h>
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 #define MAXNBLOCKS		32
			
 
				 #define MAXTHREADSPERBLOCK	128
			
@@ -45,6 +46,6 @@ extern "C" void cuda_f(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 
				 	unsigned nblocks = 128;
			
 
				 	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
			
 
				 	
			
 
				-	increment_vector<<<nblocks, nthread_per_block>>>(v, tmp, nx);
			
 
				-	cudaThreadSynchronize();
			
 
				+	increment_vector<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(v, tmp, nx);
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/tests/datawizard/sync_and_notify_data_kernels.cu
+++ b/tests/datawizard/sync_and_notify_data_kernels.cu
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 /*
			
 
				  *	increment a (val[0])
			
@@ -29,9 +30,9 @@ extern "C" void cuda_codelet_incA(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 
				 {
			
 
				 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				-	_cuda_incA<<<1,1>>>(v);
			
 
				+	_cuda_incA<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -47,7 +48,7 @@ extern "C" void cuda_codelet_incC(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 
				 {
			
 
				 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				-	_cuda_incC<<<1,1>>>(v);
			
 
				+	_cuda_incC<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }