14 years ago · eb10f6b655
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
@@ -798,9 +798,9 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
																     unsigned threads_per_block = 64;
															
 
																     unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
															
 
																-@i{    vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);}
															
 
																+@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);}
															
 
																-@i{    cudaThreadSynchronize();}
															
 
																+@i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
															
 
																 @}
															
 
																 @end smallexample
															
 
																 @end cartouche
															
@@ -1278,6 +1278,7 @@ TODO: improve!
 
																 * Task distribution vs Data transfer::
															
 
																 * Power-based scheduling::
															
 
																 * Profiling::
															
 
																+* CUDA-specific optimizations::
															
 
																 @end menu
															
 
																 Simply encapsulating application kernels into tasks already permits to
															
@@ -1384,6 +1385,21 @@ execution did happen on accelerators without penalizing performance with
 
																 the profiling overhead. More details on performance feedback are provided by the
															
 
																 next chapter.
															
 
																+@node CUDA-specific optimizations
															
 
																+@section CUDA-specific optimizations
															
 
																+
															
 
																+Due to CUDA limitations, StarPU will have a hard time overlapping
															
 
																+communications and computations if the application does not use a dedicated
															
 
																+CUDA stream for its computations. StarPU provides one through
															
 
																+@code{starpu_cuda_get_local_stream()}. For instance:
															
 
																+
															
 
																+@example
															
 
																+func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
															
 
																+cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+@end example
															
 
																+
															
 
																+Unfortunately, a lot of CUDA libraries do not have stream variants of kernels.
															
 
																+
															
 
																 @c ---------------------------------------------------------------------
															
 
																 @c Performance feedback
															
 
																 @c ---------------------------------------------------------------------
															
--- a/doc/vector_scal_cuda.texi
+++ b/doc/vector_scal_cuda.texi
@@ -19,7 +19,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
																         unsigned threads_per_block = 64;
															
 
																         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
															
 
																-        vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);
															
 
																+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
															
 
																-        cudaThreadSynchronize();
															
 
																+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 @}
															
--- a/examples/basic_examples/block_cuda.cu
+++ b/examples/basic_examples/block_cuda.cu
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 static __global__ void cuda_block(float *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float multiplier)
															
 
																 {
															
@@ -37,6 +38,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 
																         unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
															
 
																         float *multiplier = (float *)_args;
															
 
																-        cuda_block<<<1,1>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
															
 
																-	cudaThreadSynchronize();
															
 
																+        cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
--- a/examples/basic_examples/variable_kernels.cu
+++ b/examples/basic_examples/variable_kernels.cu
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 static __global__ void cuda_variable(float * tab)
															
 
																 {
															
@@ -26,5 +27,6 @@ extern "C" void cuda_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 
																 {
															
 
																 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																-	cuda_variable<<<1,1>>>(val);
															
 
																+	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
--- a/examples/basic_examples/vector_scal_cuda.cu
+++ b/examples/basic_examples/vector_scal_cuda.cu
@@ -19,6 +19,7 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 static __global__ void vector_mult_cuda(float *val, unsigned n,
															
 
																                                         float factor)
															
@@ -40,7 +41,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
																 	unsigned threads_per_block = 64;
															
 
																 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
															
 
																-        vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);
															
 
																+        vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n, *factor);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
--- a/examples/filters/fblock_cuda.cu
+++ b/examples/filters/fblock_cuda.cu
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 static __global__ void fblock_cuda(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float factor)
															
 
																 {
															
@@ -38,7 +39,7 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
																         unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
															
 
																         /* TODO: use more blocks and threads in blocks */
															
 
																-        fblock_cuda<<<1,1>>>(block, nx, ny, nz, ldy, ldz, *factor);
															
 
																+        fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
--- a/examples/incrementer/incrementer_kernels.cu
+++ b/examples/incrementer/incrementer_kernels.cu
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 static __global__ void cuda_incrementer(float * tab)
															
 
																 {
															
@@ -29,5 +30,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 
																 	(void)_args;
															
 
																 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																-	cuda_incrementer<<<1,1>>>(val);
															
 
																+	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
--- a/examples/pi/SobolQRNG/sobol_gpu.cu
+++ b/examples/pi/SobolQRNG/sobol_gpu.cu
@@ -39,6 +39,8 @@
 
																 #include "sobol.h"
															
 
																 #include "sobol_gpu.h"
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 #define k_2powneg32 2.3283064E-10F
															
@@ -164,5 +166,5 @@ void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float
 
																     dimBlock.x = threadsperblock;
															
 
																     // Execute GPU kernel
															
 
																-    sobolGPU_kernel<<<dimGrid, dimBlock>>>(n_vectors, n_dimensions, d_directions, d_output);
															
 
																+    sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
															
 
																 }
															
--- a/examples/pi/pi_kernel.cu
+++ b/examples/pi/pi_kernel.cu
@@ -16,6 +16,7 @@
 
																 #include "SobolQRNG/sobol_gpu.h"
															
 
																 #include "pi.h"
															
 
																+#include <starpu_cuda.h>
															
 
																 #define MAXNBLOCKS	128
															
 
																 #define MAXTHREADSPERBLOCK	256
															
@@ -109,7 +110,7 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 
																 	STARPU_ASSERT(random_numbers);
															
 
																 	sobolGPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	TYPE *random_numbers_x = &random_numbers[0];
															
 
																 	TYPE *random_numbers_y = &random_numbers[nx];
															
@@ -132,14 +133,14 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 
																 	/* each entry of per_block_cnt contains the number of successful shots
															
 
																 	 * in the corresponding block. */
															
 
																-	monte_carlo<<<nblocks, nthread_per_block>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
															
 
																+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
															
 
																 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
															
 
																 	/* compute the total number of successful shots by adding the elements
															
 
																 	 * of the per_block_cnt array */
															
 
																-	sum_per_block_cnt<<<1, nblocks>>>(per_block_cnt, cnt);
															
 
																-	cures = cudaThreadSynchronize();
															
 
																+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
															
 
																+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	if (cures)
															
 
																 		STARPU_CUDA_REPORT_ERROR(cures);
															
--- a/examples/pi/pi_redux_kernel.cu
+++ b/examples/pi/pi_redux_kernel.cu
@@ -113,14 +113,14 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 
																 	/* each entry of per_block_cnt contains the number of successful shots
															
 
																 	 * in the corresponding block. */
															
 
																-	monte_carlo<<<nblocks, nthread_per_block>>>(x, y, n, per_block_cnt);
															
 
																+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
															
 
																 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
															
 
																 	/* compute the total number of successful shots by adding the elements
															
 
																 	 * of the per_block_cnt array */
															
 
																-	sum_per_block_cnt<<<1, nblocks>>>(per_block_cnt, shot_cnt);
															
 
																-	cures = cudaThreadSynchronize();
															
 
																+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
															
 
																+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 	if (cures)
															
 
																 		STARPU_CUDA_REPORT_ERROR(cures);
															
--- a/examples/spmv/spmv_cuda.cu
+++ b/examples/spmv/spmv_cuda.cu
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 #define MIN(a,b)	((a)<(b)?(a):(b))
															
@@ -95,10 +96,10 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
																 	dim3 dimBlock(8, 1);
															
 
																 	dim3 dimGrid(512, 1);
															
 
																-	spmv_kernel_3<<<dimGrid, dimBlock>>>(nnz, nrow, nzval, colind, rowptr,
															
 
																-						firstentry, vecin, nx_in, vecout, nx_out);
															
 
																+	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
															
 
																+		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
--- a/examples/starpufft/cudax_kernels.cu
+++ b/examples/starpufft/cudax_kernels.cu
@@ -28,13 +28,13 @@
 
																 \
															
 
																 	if (n < threads_per_block) { \
															
 
																 		dim3 dimGrid(n); \
															
 
																-		func <<<dimGrid, 1>>> args; \
															
 
																+		func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
															
 
																 	} else { \
															
 
																 		dim3 dimGrid(n / threads_per_block); \
															
 
																 		dim3 dimBlock(threads_per_block); \
															
 
																-		func <<<dimGrid, dimBlock>>> args; \
															
 
																+		func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
															
 
																 	} \
															
 
																-	cudaThreadSynchronize(); \
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
															
 
																 extern "C" __global__ void
															
 
																 STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
															
@@ -83,24 +83,24 @@ STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsign
 
																 	if (n < threads_per_dim) { \
															
 
																 		if (m < threads_per_dim) { \
															
 
																 			dim3 dimGrid(n, m); \
															
 
																-			func <<<dimGrid, 1>>> args; \
															
 
																+			func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
															
 
																 		} else { \
															
 
																 			dim3 dimGrid(1, m / threads_per_dim); \
															
 
																 			dim3 dimBlock(n, threads_per_dim); \
															
 
																-			func <<<dimGrid, dimBlock>>> args; \
															
 
																+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
															
 
																 		} \
															
 
																 	} else {  \
															
 
																 		if (m < threads_per_dim) { \
															
 
																 			dim3 dimGrid(n / threads_per_dim, 1); \
															
 
																 			dim3 dimBlock(threads_per_dim, m); \
															
 
																-			func <<<dimGrid, dimBlock>>> args; \
															
 
																+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
															
 
																 		} else { \
															
 
																 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
															
 
																 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
															
 
																-			func <<<dimGrid, dimBlock>>> args; \
															
 
																+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
															
 
																 		} \
															
 
																 	} \
															
 
																-	cudaThreadSynchronize(); \
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
															
 
																 extern "C" __global__ void
															
 
																 STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
															
--- a/examples/starpufft/cudax_kernels.h
+++ b/examples/starpufft/cudax_kernels.h
@@ -16,6 +16,7 @@
 
																  */
															
 
																 #include <cuComplex.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 _externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
															
 
																 _externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
															
 
																 _externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);
															
--- a/examples/starpufft/starpufftx1d.c
+++ b/examples/starpufft/starpufftx1d.c
@@ -72,7 +72,7 @@ STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 /* fft1:
															
@@ -97,6 +97,8 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	if (!plan->plans[workerid].initialized1) {
															
 
																 		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
															
 
																+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
															
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 		plan->plans[workerid].initialized1 = 1;
															
@@ -107,7 +109,7 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 /* fft2:
															
@@ -132,6 +134,8 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	if (!plan->plans[workerid].initialized2) {
															
 
																 		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
															
 
																+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
															
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 		plan->plans[workerid].initialized2 = 1;
															
@@ -141,7 +145,7 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
																 	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
															
 
																 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 #endif
															
--- a/examples/starpufft/starpufftx2d.c
+++ b/examples/starpufft/starpufftx2d.c
@@ -41,7 +41,7 @@ STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																 	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 /* Perform an n2,m2 fft */
															
@@ -67,6 +67,8 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	if (!plan->plans[workerid].initialized1) {
															
 
																 		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
															
 
																+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
															
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 		plan->plans[workerid].initialized1 = 1;
															
@@ -78,7 +80,7 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	/* synchronization is done after the twiddling */
															
 
																 	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 static void
															
@@ -104,6 +106,8 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
																 	if (!plan->plans[workerid].initialized2) {
															
 
																 		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
															
 
																+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																+		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
															
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 		plan->plans[workerid].initialized2 = 1;
															
@@ -114,7 +118,7 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
																 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
															
 
																 	}
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 #endif
															
--- a/examples/stencil/life_cuda.cu
+++ b/examples/stencil/life_cuda.cu
@@ -16,6 +16,7 @@
 
																 #define _externC extern "C"
															
 
																 #include "stencil.h"
															
 
																+#include <starpu_cuda.h>
															
 
																 /* Heart of the stencil computation: compute a new state from an old one. */
															
@@ -72,5 +73,5 @@ cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int n
 
																 	dim3 dimBlock(threads_per_dim_x, threads_per_dim_y);
															
 
																 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
															
 
																 #endif
															
 
																-	cuda_life_update <<<dimGrid, dimBlock>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
															
 
																+	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
															
 
																 }
															
--- a/examples/stencil/shadow.cu
+++ b/examples/stencil/shadow.cu
@@ -16,6 +16,7 @@
 
																 #define _externC extern "C"
															
 
																 #include "stencil.h"
															
 
																+#include <starpu_cuda.h>
															
 
																 /* Perform replication of data on X and Y edges, to fold the domain on 
															
 
																    itself through mere replication of the source state. */
															
@@ -54,5 +55,5 @@ cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, in
 
																 	dim3 dimBlock(threads_per_dim_x, threads_per_dim_y);
															
 
																 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
															
 
																 #endif
															
 
																-	cuda_shadow <<<dimGrid, dimBlock>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
															
 
																+	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
															
 
																 }
															
--- a/examples/stencil/stencil-kernels.c
+++ b/examples/stencil/stencil-kernels.c
@@ -173,7 +173,7 @@ static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
 
																 	unsigned offset = firstz*block->ldz;
															
 
																 	TYPE *block_data = (TYPE *)block->ptr;
															
 
																 	TYPE *boundary_data = (TYPE *)boundary->ptr;
															
 
																-	cudaMemcpy(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice);
															
 
																+	cudaMemcpyAsync(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 /*
															
@@ -243,12 +243,12 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 
																 #ifdef LIFE
															
 
																 		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
															
 
																 #else
															
 
																-		cudaMemcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice);
															
 
																+		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
															
 
																 #endif /* LIFE */
															
 
																 	}
															
 
																 	cudaError_t cures;
															
 
																-	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
															
 
																+	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
															
 
																 		STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 }
															
@@ -407,7 +407,7 @@ static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
 
																 	unsigned offset = firstz*block->ldz;
															
 
																 	TYPE *block_data = (TYPE *)block->ptr;
															
 
																 	TYPE *boundary_data = (TYPE *)boundary->ptr;
															
 
																-	cudaMemcpy(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice);
															
 
																+	cudaMemcpyAsync(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 #endif /* STARPU_USE_CUDA */
															
@@ -459,7 +459,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 
																 	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
															
 
																 	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 /* bottom save, CUDA version */
															
@@ -473,7 +473,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 
																 	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
															
 
																 	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 #endif /* STARPU_USE_CUDA */
															
--- a/include/starpu_cuda.h
+++ b/include/starpu_cuda.h
@@ -79,7 +79,7 @@ extern "C" {
 
																 		STARPU_CUBLAS_OOPS();					\
															
 
																 	} while (0)
															
 
																-cudaStream_t *starpu_cuda_get_local_stream(void);
															
 
																+cudaStream_t starpu_cuda_get_local_stream(void);
															
 
																 #ifdef __cplusplus
															
 
																 }
															
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -66,9 +66,9 @@ struct starpu_data_copy_methods {
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	/* for asynchronous CUDA transfers */
															
 
																-	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
															
 
																-	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
															
 
																-	int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
															
 
																+	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
															
 
																+	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
															
 
																+	int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
--- a/mpi/tests/ring_kernel.cu
+++ b/mpi/tests/ring_kernel.cu
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 static __global__ void cuda_incrementer(unsigned *token)
															
 
																 {
															
@@ -26,5 +27,6 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 
																 	(void) _args;
															
 
																 	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																-	cuda_incrementer<<<1,1>>>(tokenptr);
															
 
																+	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -102,7 +102,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	cudaError_t cures;
															
 
																-	cudaStream_t *stream;
															
 
																+	cudaStream_t stream;
															
 
																 #endif
															
 
																 	void *src_interface = src_replicate->interface;
															
@@ -132,7 +132,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
																 				stream = starpu_cuda_get_local_stream();
															
 
																 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
															
 
																-				cures = cudaEventRecord(req->async_channel.cuda_event, *stream);
															
 
																+				cures = cudaEventRecord(req->async_channel.cuda_event, stream);
															
 
																 				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 			}
															
 
																 		}
															
@@ -157,7 +157,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
																 			stream = starpu_cuda_get_local_stream();
															
 
																 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
															
 
																-			cures = cudaEventRecord(req->async_channel.cuda_event, *stream);
															
 
																+			cures = cudaEventRecord(req->async_channel.cuda_event, stream);
															
 
																 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 		}
															
 
																 		break;
															
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -31,8 +31,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 
																 #ifdef STARPU_USE_CUDA
															
 
																 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
															
 
																 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
															
 
																-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
															
 
																-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
															
 
																+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
															
 
																+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
															
 
																 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
@@ -435,7 +435,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 
																 	return 0;
															
 
																 }
															
 
																-static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream, enum cudaMemcpyKind kind)
															
 
																+static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream, enum cudaMemcpyKind kind)
															
 
																 {
															
 
																 	starpu_block_interface_t *src_block = src_interface;
															
 
																 	starpu_block_interface_t *dst_block = dst_interface;
															
@@ -457,7 +457,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
																 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
															
 
																 		{
															
 
																 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
															
 
																-					nx*ny*nz*elemsize, kind, *stream);
															
 
																+					nx*ny*nz*elemsize, kind, stream);
															
 
																 			if (STARPU_UNLIKELY(cures))
															
 
																 			{
															
 
																 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
															
@@ -476,7 +476,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
																 			/* Are all plans contiguous */
															
 
																 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
															
 
																 					(char *)src_block->ptr, src_block->ldz*elemsize,
															
 
																-					nx*ny*elemsize, nz, kind, *stream);
															
 
																+					nx*ny*elemsize, nz, kind, stream);
															
 
																 			if (STARPU_UNLIKELY(cures))
															
 
																 			{
															
 
																 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
															
@@ -502,7 +502,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
																 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
															
 
																                                                   (char *)src_ptr, src_block->ldy*elemsize,
															
 
																-                                                  nx*elemsize, ny, kind, *stream);
															
 
																+                                                  nx*elemsize, ny, kind, stream);
															
 
																 			if (STARPU_UNLIKELY(cures))
															
 
																 			{
															
@@ -557,12 +557,12 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
																 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
															
 
																 }
															
 
																-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
															
 
																+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
															
 
																 {
															
 
																 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
															
 
																 }
															
 
																-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
															
 
																+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
															
 
																 {
															
 
																 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
															
 
																 }
															
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -30,8 +30,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 
																 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
															
 
																 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
															
 
																 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
															
 
																-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
															
 
																-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
															
 
																+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
															
 
																+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
															
@@ -385,7 +385,7 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
																 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
															
 
																 }
															
 
																-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
															
 
																+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
															
 
																 {
															
 
																 	starpu_matrix_interface_t *src_matrix = src_interface;
															
 
																 	starpu_matrix_interface_t *dst_matrix = dst_interface;
															
@@ -396,7 +396,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 
																 	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
															
 
																 			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
															
 
																 			(size_t)src_matrix->nx*elemsize, src_matrix->ny,
															
 
																-			cudaMemcpyDeviceToHost, *stream);
															
 
																+			cudaMemcpyDeviceToHost, stream);
															
 
																 	if (cures)
															
 
																 	{
															
 
																 		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
															
@@ -415,7 +415,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 
																 	return -EAGAIN;
															
 
																 }
															
 
																-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
															
 
																+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
															
 
																 {
															
 
																 	starpu_matrix_interface_t *src_matrix = src_interface;
															
 
																 	starpu_matrix_interface_t *dst_matrix = dst_interface;
															
@@ -426,7 +426,7 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 
																 	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
															
 
																 				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
															
 
																 				src_matrix->nx*elemsize, src_matrix->ny,
															
 
																-				cudaMemcpyHostToDevice, *stream);
															
 
																+				cudaMemcpyHostToDevice, stream);
															
 
																 	if (cures)
															
 
																 	{
															
 
																 		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
															
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -29,8 +29,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_int
 
																 #ifdef STARPU_USE_CUDA
															
 
																 static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
															
 
																 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
															
 
																-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
															
 
																-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
															
 
																+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
															
 
																+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
															
 
																 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
@@ -305,13 +305,13 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
																 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
															
 
																 					void *dst_interface, unsigned dst_node __attribute__((unused)),
															
 
																-					cudaStream_t *stream, enum cudaMemcpyKind kind)
															
 
																+					cudaStream_t stream, enum cudaMemcpyKind kind)
															
 
																 {
															
 
																 	starpu_variable_interface_t *src_variable = src_interface;
															
 
																 	starpu_variable_interface_t *dst_variable = dst_interface;
															
 
																 	cudaError_t cures;
															
 
																-	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, *stream);
															
 
																+	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, stream);
															
 
																 	if (cures)
															
 
																 	{
															
 
																 		/* do it in a synchronous fashion */
															
@@ -330,13 +330,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
																 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
															
 
																-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
															
 
																+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
															
 
																 {
															
 
																 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
															
 
																 }
															
 
																 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
															
 
																-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
															
 
																+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
															
 
																 {
															
 
																 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
															
 
																 }
															
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -29,8 +29,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 
																 #ifdef STARPU_USE_CUDA
															
 
																 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
															
 
																 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
															
 
																-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t *stream);
															
 
																-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t *stream);
															
 
																+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
															
 
																+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
															
 
																 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
@@ -340,13 +340,13 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
																 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
															
 
																 					void *dst_interface, unsigned dst_node __attribute__((unused)),
															
 
																-					cudaStream_t *stream, enum cudaMemcpyKind kind)
															
 
																+					cudaStream_t stream, enum cudaMemcpyKind kind)
															
 
																 {
															
 
																 	starpu_vector_interface_t *src_vector = src_interface;
															
 
																 	starpu_vector_interface_t *dst_vector = dst_interface;
															
 
																 	cudaError_t cures;
															
 
																-	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, *stream);
															
 
																+	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
															
 
																 	if (cures)
															
 
																 	{
															
 
																 		/* do it in a synchronous fashion */
															
@@ -364,13 +364,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
																 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
															
 
																-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
															
 
																+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
															
 
																 {
															
 
																 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
															
 
																 }
															
 
																 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
															
 
																-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
															
 
																+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
															
 
																 {
															
 
																 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
															
 
																 }
															
--- a/src/datawizard/interfaces/void_interface.c
+++ b/src/datawizard/interfaces/void_interface.c
@@ -26,7 +26,7 @@
 
																 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
															
 
																+static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																 static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event);
															
@@ -141,7 +141,7 @@ static int dummy_cuda_copy_async(void *src_interface __attribute__((unused)),
 
																 				unsigned src_node __attribute__((unused)),
															
 
																 				void *dst_interface __attribute__((unused)),
															
 
																 				unsigned dst_node __attribute__((unused)),
															
 
																-				cudaStream_t *stream __attribute__ ((unused)))
															
 
																+				cudaStream_t stream __attribute__ ((unused)))
															
 
																 {
															
 
																 	return 0;
															
 
																 }
															
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -83,16 +83,17 @@ static void unlimit_gpu_mem_if_needed(int devid)
 
																 	}
															
 
																 }
															
 
																-cudaStream_t *starpu_cuda_get_local_stream(void)
															
 
																+cudaStream_t starpu_cuda_get_local_stream(void)
															
 
																 {
															
 
																 	int worker = starpu_worker_get_id();
															
 
																-	return &streams[worker];
															
 
																+	return streams[worker];
															
 
																 }
															
 
																 static void init_context(int devid)
															
 
																 {
															
 
																 	cudaError_t cures;
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																 	cures = cudaSetDevice(devid);
															
 
																 	if (STARPU_UNLIKELY(cures))
															
@@ -103,7 +104,7 @@ static void init_context(int devid)
 
																 	limit_gpu_mem_if_needed(devid);
															
 
																-	cures = cudaStreamCreate(starpu_cuda_get_local_stream());
															
 
																+	cures = cudaStreamCreate(&streams[workerid]);
															
 
																 	if (STARPU_UNLIKELY(cures))
															
 
																 		STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 }
															
--- a/tests/datawizard/acquire_release_cuda.cu
+++ b/tests/datawizard/acquire_release_cuda.cu
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 static __global__ void _increment_cuda_codelet(unsigned *val)
															
 
																 {
															
@@ -25,7 +26,7 @@ extern "C" void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_a
 
																 {
															
 
																 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																-	_increment_cuda_codelet<<<1,1>>>(val);
															
 
																+	_increment_cuda_codelet<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
--- a/tests/datawizard/cuda_codelet_unsigned_inc.cu
+++ b/tests/datawizard/cuda_codelet_unsigned_inc.cu
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 static __global__ void _cuda_unsigned_inc(unsigned *val)
															
 
																 {
															
@@ -25,7 +26,7 @@ extern "C" void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED
 
																 {
															
 
																 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																-	_cuda_unsigned_inc<<<1,1>>>(val);
															
 
																+	_cuda_unsigned_inc<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
--- a/tests/datawizard/data_invalidation.c
+++ b/tests/datawizard/data_invalidation.c
@@ -18,6 +18,7 @@
 
																 #include <unistd.h>
															
 
																 #include <errno.h>
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 #include <stdlib.h>
															
 
																 #define NLOOPS		1000
															
@@ -35,8 +36,8 @@ static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_a
 
																 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	cudaMemset(buf, 42, length);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaMemsetAsync(buf, 42, length, starpu_cuda_get_local_stream());
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 #endif
															
--- a/tests/datawizard/scratch_cuda.cu
+++ b/tests/datawizard/scratch_cuda.cu
@@ -16,6 +16,7 @@
 
																 #include <stdio.h>
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 #define MAXNBLOCKS		32
															
 
																 #define MAXTHREADSPERBLOCK	128
															
@@ -45,6 +46,6 @@ extern "C" void cuda_f(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 
																 	unsigned nblocks = 128;
															
 
																 	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
															
 
																-	increment_vector<<<nblocks, nthread_per_block>>>(v, tmp, nx);
															
 
																-	cudaThreadSynchronize();
															
 
																+	increment_vector<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(v, tmp, nx);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
--- a/tests/datawizard/sync_and_notify_data_kernels.cu
+++ b/tests/datawizard/sync_and_notify_data_kernels.cu
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 /*
															
 
																  *	increment a (val[0])
															
@@ -29,9 +30,9 @@ extern "C" void cuda_codelet_incA(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 
																 {
															
 
																 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																-	_cuda_incA<<<1,1>>>(v);
															
 
																+	_cuda_incA<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 /*
															
@@ -47,7 +48,7 @@ extern "C" void cuda_codelet_incC(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 
																 {
															
 
																 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																-	_cuda_incC<<<1,1>>>(v);
															
 
																+	_cuda_incC<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }