Procházet zdrojové kódy

- Make starpu_cuda_get_local_stream return the stream instead of a pointer to it
- Pass the stream to foo_to_bar_async instead of a pointer to it
- Make use of the local stream whenever possible
- Document the hint

Samuel Thibault před 14 roky
rodič
revize
eb10f6b655
33 změnil soubory, kde provedl 142 přidání a 96 odebrání
  1. 18 2
      doc/starpu.texi
  2. 2 2
      doc/vector_scal_cuda.texi
  3. 3 2
      examples/basic_examples/block_cuda.cu
  4. 3 1
      examples/basic_examples/variable_kernels.cu
  5. 3 2
      examples/basic_examples/vector_scal_cuda.cu
  6. 3 2
      examples/filters/fblock_cuda.cu
  7. 3 1
      examples/incrementer/incrementer_kernels.cu
  8. 3 1
      examples/pi/SobolQRNG/sobol_gpu.cu
  9. 5 4
      examples/pi/pi_kernel.cu
  10. 3 3
      examples/pi/pi_redux_kernel.cu
  11. 4 3
      examples/spmv/spmv_cuda.cu
  12. 8 8
      examples/starpufft/cudax_kernels.cu
  13. 1 0
      examples/starpufft/cudax_kernels.h
  14. 7 3
      examples/starpufft/starpufftx1d.c
  15. 7 3
      examples/starpufft/starpufftx2d.c
  16. 2 1
      examples/stencil/life_cuda.cu
  17. 2 1
      examples/stencil/shadow.cu
  18. 6 6
      examples/stencil/stencil-kernels.c
  19. 1 1
      include/starpu_cuda.h
  20. 3 3
      include/starpu_data_interfaces.h
  21. 3 1
      mpi/tests/ring_kernel.cu
  22. 3 3
      src/datawizard/copy_driver.c
  23. 8 8
      src/datawizard/interfaces/block_interface.c
  24. 6 6
      src/datawizard/interfaces/matrix_interface.c
  25. 6 6
      src/datawizard/interfaces/variable_interface.c
  26. 6 6
      src/datawizard/interfaces/vector_interface.c
  27. 2 2
      src/datawizard/interfaces/void_interface.c
  28. 4 3
      src/drivers/cuda/driver_cuda.c
  29. 3 2
      tests/datawizard/acquire_release_cuda.cu
  30. 3 2
      tests/datawizard/cuda_codelet_unsigned_inc.cu
  31. 3 2
      tests/datawizard/data_invalidation.c
  32. 3 2
      tests/datawizard/scratch_cuda.cu
  33. 5 4
      tests/datawizard/sync_and_notify_data_kernels.cu

+ 18 - 2
doc/starpu.texi

@@ -798,9 +798,9 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
     unsigned threads_per_block = 64;
     unsigned threads_per_block = 64;
     unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
     unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
-@i{    vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);}
+@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);}
 
 
-@i{    cudaThreadSynchronize();}
+@i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
 @}
 @}
 @end smallexample
 @end smallexample
 @end cartouche
 @end cartouche
@@ -1278,6 +1278,7 @@ TODO: improve!
 * Task distribution vs Data transfer::
 * Task distribution vs Data transfer::
 * Power-based scheduling::
 * Power-based scheduling::
 * Profiling::
 * Profiling::
+* CUDA-specific optimizations::
 @end menu
 @end menu
 
 
 Simply encapsulating application kernels into tasks already permits to
 Simply encapsulating application kernels into tasks already permits to
@@ -1384,6 +1385,21 @@ execution did happen on accelerators without penalizing performance with
 the profiling overhead. More details on performance feedback are provided by the
 the profiling overhead. More details on performance feedback are provided by the
 next chapter.
 next chapter.
 
 
+@node CUDA-specific optimizations
+@section CUDA-specific optimizations
+
+Due to CUDA limitations, StarPU will have a hard time overlapping
+communications and computations if the application does not use a dedicated
+CUDA stream for its computations. StarPU provides one by the use of
+@code{starpu_cuda_get_local_stream()}. For instance:
+
+@example
+func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaStreamSynchronize(starpu_cuda_get_local_stream());
+@end example
+
+Unfortunately, a lot of cuda libraries do not have stream variants of kernels.
+
 @c ---------------------------------------------------------------------
 @c ---------------------------------------------------------------------
 @c Performance feedback
 @c Performance feedback
 @c ---------------------------------------------------------------------
 @c ---------------------------------------------------------------------

+ 2 - 2
doc/vector_scal_cuda.texi

@@ -19,7 +19,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned threads_per_block = 64;
         unsigned threads_per_block = 64;
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
-        vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);
+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
 
 
-        cudaThreadSynchronize();
+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
 @}
 @}

+ 3 - 2
examples/basic_examples/block_cuda.cu

@@ -15,6 +15,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 static __global__ void cuda_block(float *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float multiplier)
 static __global__ void cuda_block(float *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float multiplier)
 {
 {
@@ -37,6 +38,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
         unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
         unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
         float *multiplier = (float *)_args;
         float *multiplier = (float *)_args;
 
 
-        cuda_block<<<1,1>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
-	cudaThreadSynchronize();
+        cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 3 - 1
examples/basic_examples/variable_kernels.cu

@@ -15,6 +15,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 static __global__ void cuda_variable(float * tab)
 static __global__ void cuda_variable(float * tab)
 {
 {
@@ -26,5 +27,6 @@ extern "C" void cuda_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 {
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 
-	cuda_variable<<<1,1>>>(val);
+	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 3 - 2
examples/basic_examples/vector_scal_cuda.cu

@@ -19,6 +19,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 static __global__ void vector_mult_cuda(float *val, unsigned n,
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
                                         float factor)
@@ -40,7 +41,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
-        vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);
+        vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n, *factor);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 3 - 2
examples/filters/fblock_cuda.cu

@@ -15,6 +15,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 static __global__ void fblock_cuda(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float factor)
 static __global__ void fblock_cuda(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float factor)
 {
 {
@@ -38,7 +39,7 @@ extern "C" void cuda_func(void *buffers[], void *_args)
         unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
         unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
 
 
         /* TODO: use more blocks and threads in blocks */
         /* TODO: use more blocks and threads in blocks */
-        fblock_cuda<<<1,1>>>(block, nx, ny, nz, ldy, ldz, *factor);
+        fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 3 - 1
examples/incrementer/incrementer_kernels.cu

@@ -15,6 +15,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 static __global__ void cuda_incrementer(float * tab)
 static __global__ void cuda_incrementer(float * tab)
 {
 {
@@ -29,5 +30,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	(void)_args;
 	(void)_args;
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 
-	cuda_incrementer<<<1,1>>>(val);
+	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 3 - 1
examples/pi/SobolQRNG/sobol_gpu.cu

@@ -39,6 +39,8 @@
 
 
 #include "sobol.h"
 #include "sobol.h"
 #include "sobol_gpu.h"
 #include "sobol_gpu.h"
+#include <starpu.h>
+#include <starpu_cuda.h>
 
 
 #define k_2powneg32 2.3283064E-10F
 #define k_2powneg32 2.3283064E-10F
 
 
@@ -164,5 +166,5 @@ void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float
     dimBlock.x = threadsperblock;
     dimBlock.x = threadsperblock;
 
 
     // Execute GPU kernel
     // Execute GPU kernel
-    sobolGPU_kernel<<<dimGrid, dimBlock>>>(n_vectors, n_dimensions, d_directions, d_output);
+    sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
 }
 }

+ 5 - 4
examples/pi/pi_kernel.cu

@@ -16,6 +16,7 @@
 
 
 #include "SobolQRNG/sobol_gpu.h"
 #include "SobolQRNG/sobol_gpu.h"
 #include "pi.h"
 #include "pi.h"
+#include <starpu_cuda.h>
 
 
 #define MAXNBLOCKS	128
 #define MAXNBLOCKS	128
 #define MAXTHREADSPERBLOCK	256
 #define MAXTHREADSPERBLOCK	256
@@ -109,7 +110,7 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 	STARPU_ASSERT(random_numbers);
 	STARPU_ASSERT(random_numbers);
 	
 	
 	sobolGPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
 	sobolGPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
 	TYPE *random_numbers_x = &random_numbers[0];
 	TYPE *random_numbers_x = &random_numbers[0];
 	TYPE *random_numbers_y = &random_numbers[nx];
 	TYPE *random_numbers_y = &random_numbers[nx];
@@ -132,14 +133,14 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 
 
 	/* each entry of per_block_cnt contains the number of successful shots
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	 * in the corresponding block. */
-	monte_carlo<<<nblocks, nthread_per_block>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
 
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 
 	/* compute the total number of successful shots by adding the elements
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	 * of the per_block_cnt array */
-	sum_per_block_cnt<<<1, nblocks>>>(per_block_cnt, cnt);
-	cures = cudaThreadSynchronize();
+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 

+ 3 - 3
examples/pi/pi_redux_kernel.cu

@@ -113,14 +113,14 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 
 
 	/* each entry of per_block_cnt contains the number of successful shots
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	 * in the corresponding block. */
-	monte_carlo<<<nblocks, nthread_per_block>>>(x, y, n, per_block_cnt);
+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
 
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 
 	/* compute the total number of successful shots by adding the elements
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	 * of the per_block_cnt array */
-	sum_per_block_cnt<<<1, nblocks>>>(per_block_cnt, shot_cnt);
-	cures = cudaThreadSynchronize();
+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 

+ 4 - 3
examples/spmv/spmv_cuda.cu

@@ -15,6 +15,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 #define MIN(a,b)	((a)<(b)?(a):(b))
 #define MIN(a,b)	((a)<(b)?(a):(b))
 
 
@@ -95,10 +96,10 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 	dim3 dimBlock(8, 1);
 	dim3 dimBlock(8, 1);
 	dim3 dimGrid(512, 1);
 	dim3 dimGrid(512, 1);
 
 
-	spmv_kernel_3<<<dimGrid, dimBlock>>>(nnz, nrow, nzval, colind, rowptr,
-						firstentry, vecin, nx_in, vecout, nx_out);
+	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
+		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
 }
 }
 
 

+ 8 - 8
examples/starpufft/cudax_kernels.cu

@@ -28,13 +28,13 @@
 \
 \
 	if (n < threads_per_block) { \
 	if (n < threads_per_block) { \
 		dim3 dimGrid(n); \
 		dim3 dimGrid(n); \
-		func <<<dimGrid, 1>>> args; \
+		func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
 	} else { \
 	} else { \
 		dim3 dimGrid(n / threads_per_block); \
 		dim3 dimGrid(n / threads_per_block); \
 		dim3 dimBlock(threads_per_block); \
 		dim3 dimBlock(threads_per_block); \
-		func <<<dimGrid, dimBlock>>> args; \
+		func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
 	} \
 	} \
-	cudaThreadSynchronize(); \
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
 
 
 extern "C" __global__ void
 extern "C" __global__ void
 STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
 STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
@@ -83,24 +83,24 @@ STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsign
 	if (n < threads_per_dim) { \
 	if (n < threads_per_dim) { \
 		if (m < threads_per_dim) { \
 		if (m < threads_per_dim) { \
 			dim3 dimGrid(n, m); \
 			dim3 dimGrid(n, m); \
-			func <<<dimGrid, 1>>> args; \
+			func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
 		} else { \
 		} else { \
 			dim3 dimGrid(1, m / threads_per_dim); \
 			dim3 dimGrid(1, m / threads_per_dim); \
 			dim3 dimBlock(n, threads_per_dim); \
 			dim3 dimBlock(n, threads_per_dim); \
-			func <<<dimGrid, dimBlock>>> args; \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
 		} \
 		} \
 	} else {  \
 	} else {  \
 		if (m < threads_per_dim) { \
 		if (m < threads_per_dim) { \
 			dim3 dimGrid(n / threads_per_dim, 1); \
 			dim3 dimGrid(n / threads_per_dim, 1); \
 			dim3 dimBlock(threads_per_dim, m); \
 			dim3 dimBlock(threads_per_dim, m); \
-			func <<<dimGrid, dimBlock>>> args; \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
 		} else { \
 		} else { \
 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
-			func <<<dimGrid, dimBlock>>> args; \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
 		} \
 		} \
 	} \
 	} \
-	cudaThreadSynchronize(); \
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
 
 
 extern "C" __global__ void
 extern "C" __global__ void
 STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
 STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)

+ 1 - 0
examples/starpufft/cudax_kernels.h

@@ -16,6 +16,7 @@
  */
  */
 
 
 #include <cuComplex.h>
 #include <cuComplex.h>
+#include <starpu_cuda.h>
 _externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
 _externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
 _externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
 _externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
 _externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);
 _externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);

+ 7 - 3
examples/starpufft/starpufftx1d.c

@@ -72,7 +72,7 @@ STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 	
 	
 	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
 	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 
 
 /* fft1:
 /* fft1:
@@ -97,6 +97,8 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
 
 	if (!plan->plans[workerid].initialized1) {
 	if (!plan->plans[workerid].initialized1) {
 		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
 		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
 
 
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized1 = 1;
 		plan->plans[workerid].initialized1 = 1;
@@ -107,7 +109,7 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
 
 	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
 	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 
 
 /* fft2:
 /* fft2:
@@ -132,6 +134,8 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
 
 	if (!plan->plans[workerid].initialized2) {
 	if (!plan->plans[workerid].initialized2) {
 		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
 		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
 
 
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized2 = 1;
 		plan->plans[workerid].initialized2 = 1;
@@ -141,7 +145,7 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
 	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 #endif
 #endif
 
 

+ 7 - 3
examples/starpufft/starpufftx2d.c

@@ -41,7 +41,7 @@ STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 
 	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
 	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 
 
 /* Perform an n2,m2 fft */
 /* Perform an n2,m2 fft */
@@ -67,6 +67,8 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
 
 	if (!plan->plans[workerid].initialized1) {
 	if (!plan->plans[workerid].initialized1) {
 		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
 		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
 
 
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized1 = 1;
 		plan->plans[workerid].initialized1 = 1;
@@ -78,7 +80,7 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 	/* synchronization is done after the twiddling */
 	/* synchronization is done after the twiddling */
 	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
 	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 
 
 static void
 static void
@@ -104,6 +106,8 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
 
 	if (!plan->plans[workerid].initialized2) {
 	if (!plan->plans[workerid].initialized2) {
 		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
 		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
 
 
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized2 = 1;
 		plan->plans[workerid].initialized2 = 1;
@@ -114,7 +118,7 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 	}
 	}
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 #endif
 #endif
 
 

+ 2 - 1
examples/stencil/life_cuda.cu

@@ -16,6 +16,7 @@
 
 
 #define _externC extern "C"
 #define _externC extern "C"
 #include "stencil.h"
 #include "stencil.h"
+#include <starpu_cuda.h>
 
 
 /* Heart of the stencil computation: compute a new state from an old one. */
 /* Heart of the stencil computation: compute a new state from an old one. */
 
 
@@ -72,5 +73,5 @@ cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int n
 	dim3 dimBlock(threads_per_dim_x, threads_per_dim_y);
 	dim3 dimBlock(threads_per_dim_x, threads_per_dim_y);
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 #endif
-	cuda_life_update <<<dimGrid, dimBlock>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
+	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
 }
 }

+ 2 - 1
examples/stencil/shadow.cu

@@ -16,6 +16,7 @@
 
 
 #define _externC extern "C"
 #define _externC extern "C"
 #include "stencil.h"
 #include "stencil.h"
+#include <starpu_cuda.h>
 
 
 /* Perform replication of data on X and Y edges, to fold the domain on 
 /* Perform replication of data on X and Y edges, to fold the domain on 
    itself through mere replication of the source state. */
    itself through mere replication of the source state. */
@@ -54,5 +55,5 @@ cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, in
 	dim3 dimBlock(threads_per_dim_x, threads_per_dim_y);
 	dim3 dimBlock(threads_per_dim_x, threads_per_dim_y);
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 #endif
-	cuda_shadow <<<dimGrid, dimBlock>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
+	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
 }
 }

+ 6 - 6
examples/stencil/stencil-kernels.c

@@ -173,7 +173,7 @@ static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
 	unsigned offset = firstz*block->ldz;
 	unsigned offset = firstz*block->ldz;
 	TYPE *block_data = (TYPE *)block->ptr;
 	TYPE *block_data = (TYPE *)block->ptr;
 	TYPE *boundary_data = (TYPE *)boundary->ptr;
 	TYPE *boundary_data = (TYPE *)boundary->ptr;
-	cudaMemcpy(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice);
+	cudaMemcpyAsync(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 }
 }
 
 
 /*
 /*
@@ -243,12 +243,12 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 #ifdef LIFE
 #ifdef LIFE
 		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
 #else
-		cudaMemcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice);
+		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 #endif /* LIFE */
 #endif /* LIFE */
 	}
 	}
 
 
 	cudaError_t cures;
 	cudaError_t cures;
-	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
+	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 
 }
 }
@@ -407,7 +407,7 @@ static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
 	unsigned offset = firstz*block->ldz;
 	unsigned offset = firstz*block->ldz;
 	TYPE *block_data = (TYPE *)block->ptr;
 	TYPE *block_data = (TYPE *)block->ptr;
 	TYPE *boundary_data = (TYPE *)boundary->ptr;
 	TYPE *boundary_data = (TYPE *)boundary->ptr;
-	cudaMemcpy(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice);
+	cudaMemcpyAsync(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 }
 }
 #endif /* STARPU_USE_CUDA */
 #endif /* STARPU_USE_CUDA */
 
 
@@ -459,7 +459,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 
 
 	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
 	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 
 
 /* bottom save, CUDA version */
 /* bottom save, CUDA version */
@@ -473,7 +473,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 
 
 	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
 	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 #endif /* STARPU_USE_CUDA */
 #endif /* STARPU_USE_CUDA */
 
 

+ 1 - 1
include/starpu_cuda.h

@@ -79,7 +79,7 @@ extern "C" {
 		STARPU_CUBLAS_OOPS();					\
 		STARPU_CUBLAS_OOPS();					\
 	} while (0)
 	} while (0)
 
 
-cudaStream_t *starpu_cuda_get_local_stream(void);
+cudaStream_t starpu_cuda_get_local_stream(void);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 3 - 3
include/starpu_data_interfaces.h

@@ -66,9 +66,9 @@ struct starpu_data_copy_methods {
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	/* for asynchronous CUDA transfers */
 	/* for asynchronous CUDA transfers */
-	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
-	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
-	int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
+	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+	int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 #endif
 #endif
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL

+ 3 - 1
mpi/tests/ring_kernel.cu

@@ -15,6 +15,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 static __global__ void cuda_incrementer(unsigned *token)
 static __global__ void cuda_incrementer(unsigned *token)
 {
 {
@@ -26,5 +27,6 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 	(void) _args;
 	(void) _args;
 	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 
-	cuda_incrementer<<<1,1>>>(tokenptr);
+	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 3 - 3
src/datawizard/copy_driver.c

@@ -102,7 +102,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	cudaError_t cures;
 	cudaError_t cures;
-	cudaStream_t *stream;
+	cudaStream_t stream;
 #endif
 #endif
 
 
 	void *src_interface = src_replicate->interface;
 	void *src_interface = src_replicate->interface;
@@ -132,7 +132,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 				stream = starpu_cuda_get_local_stream();
 				stream = starpu_cuda_get_local_stream();
 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 
 
-				cures = cudaEventRecord(req->async_channel.cuda_event, *stream);
+				cures = cudaEventRecord(req->async_channel.cuda_event, stream);
 				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 			}
 			}
 		}
 		}
@@ -157,7 +157,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 			stream = starpu_cuda_get_local_stream();
 			stream = starpu_cuda_get_local_stream();
 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
 
-			cures = cudaEventRecord(req->async_channel.cuda_event, *stream);
+			cures = cudaEventRecord(req->async_channel.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 		}
 		}
 		break;
 		break;

+ 8 - 8
src/datawizard/interfaces/block_interface.c

@@ -31,8 +31,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
@@ -435,7 +435,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 	return 0;
 }
 }
 
 
-static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream, enum cudaMemcpyKind kind)
+static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -457,7 +457,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
 		{
 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, kind, *stream);
+					nx*ny*nz*elemsize, kind, stream);
 			if (STARPU_UNLIKELY(cures))
 			if (STARPU_UNLIKELY(cures))
 			{
 			{
 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
@@ -476,7 +476,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 			/* Are all plans contiguous */
 			/* Are all plans contiguous */
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
-					nx*ny*elemsize, nz, kind, *stream);
+					nx*ny*elemsize, nz, kind, stream);
 			if (STARPU_UNLIKELY(cures))
 			if (STARPU_UNLIKELY(cures))
 			{
 			{
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
@@ -502,7 +502,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
 
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
                                                   (char *)src_ptr, src_block->ldy*elemsize,
                                                   (char *)src_ptr, src_block->ldy*elemsize,
-                                                  nx*elemsize, ny, kind, *stream);
+                                                  nx*elemsize, ny, kind, stream);
 
 
 			if (STARPU_UNLIKELY(cures))
 			if (STARPU_UNLIKELY(cures))
 			{
 			{
@@ -557,12 +557,12 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 }
 
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 }
 
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
 }

+ 6 - 6
src/datawizard/interfaces/matrix_interface.c

@@ -30,8 +30,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
@@ -385,7 +385,7 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 }
 
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
@@ -396,7 +396,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
 			(size_t)src_matrix->nx*elemsize, src_matrix->ny,
 			(size_t)src_matrix->nx*elemsize, src_matrix->ny,
-			cudaMemcpyDeviceToHost, *stream);
+			cudaMemcpyDeviceToHost, stream);
 	if (cures)
 	if (cures)
 	{
 	{
 		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
@@ -415,7 +415,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 	return -EAGAIN;
 	return -EAGAIN;
 }
 }
 
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
@@ -426,7 +426,7 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 				src_matrix->nx*elemsize, src_matrix->ny,
 				src_matrix->nx*elemsize, src_matrix->ny,
-				cudaMemcpyHostToDevice, *stream);
+				cudaMemcpyHostToDevice, stream);
 	if (cures)
 	if (cures)
 	{
 	{
 		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,

+ 6 - 6
src/datawizard/interfaces/variable_interface.c

@@ -29,8 +29,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_int
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
@@ -305,13 +305,13 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
 
 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
 					void *dst_interface, unsigned dst_node __attribute__((unused)),
 					void *dst_interface, unsigned dst_node __attribute__((unused)),
-					cudaStream_t *stream, enum cudaMemcpyKind kind)
+					cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 {
 	starpu_variable_interface_t *src_variable = src_interface;
 	starpu_variable_interface_t *src_variable = src_interface;
 	starpu_variable_interface_t *dst_variable = dst_interface;
 	starpu_variable_interface_t *dst_variable = dst_interface;
 
 
 	cudaError_t cures;
 	cudaError_t cures;
-	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, *stream);
+	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, stream);
 	if (cures)
 	if (cures)
 	{
 	{
 		/* do it in a synchronous fashion */
 		/* do it in a synchronous fashion */
@@ -330,13 +330,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
 
 
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 }
 
 
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
 }

+ 6 - 6
src/datawizard/interfaces/vector_interface.c

@@ -29,8 +29,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t *stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t *stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
@@ -340,13 +340,13 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
 
 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
 					void *dst_interface, unsigned dst_node __attribute__((unused)),
 					void *dst_interface, unsigned dst_node __attribute__((unused)),
-					cudaStream_t *stream, enum cudaMemcpyKind kind)
+					cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 {
 	starpu_vector_interface_t *src_vector = src_interface;
 	starpu_vector_interface_t *src_vector = src_interface;
 	starpu_vector_interface_t *dst_vector = dst_interface;
 	starpu_vector_interface_t *dst_vector = dst_interface;
 
 
 	cudaError_t cures;
 	cudaError_t cures;
-	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, *stream);
+	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
 	if (cures)
 	if (cures)
 	{
 	{
 		/* do it in a synchronous fashion */
 		/* do it in a synchronous fashion */
@@ -364,13 +364,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
 
 
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 }
 
 
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
 }

+ 2 - 2
src/datawizard/interfaces/void_interface.c

@@ -26,7 +26,7 @@
 
 
 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
+static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event);
 static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event);
@@ -141,7 +141,7 @@ static int dummy_cuda_copy_async(void *src_interface __attribute__((unused)),
 				unsigned src_node __attribute__((unused)),
 				unsigned src_node __attribute__((unused)),
 				void *dst_interface __attribute__((unused)),
 				void *dst_interface __attribute__((unused)),
 				unsigned dst_node __attribute__((unused)),
 				unsigned dst_node __attribute__((unused)),
-				cudaStream_t *stream __attribute__ ((unused)))
+				cudaStream_t stream __attribute__ ((unused)))
 {
 {
 	return 0;
 	return 0;
 }
 }

+ 4 - 3
src/drivers/cuda/driver_cuda.c

@@ -83,16 +83,17 @@ static void unlimit_gpu_mem_if_needed(int devid)
 	}
 	}
 }
 }
 
 
-cudaStream_t *starpu_cuda_get_local_stream(void)
+cudaStream_t starpu_cuda_get_local_stream(void)
 {
 {
 	int worker = starpu_worker_get_id();
 	int worker = starpu_worker_get_id();
 
 
-	return &streams[worker];
+	return streams[worker];
 }
 }
 
 
 static void init_context(int devid)
 static void init_context(int devid)
 {
 {
 	cudaError_t cures;
 	cudaError_t cures;
+	int workerid = starpu_worker_get_id();
 
 
 	cures = cudaSetDevice(devid);
 	cures = cudaSetDevice(devid);
 	if (STARPU_UNLIKELY(cures))
 	if (STARPU_UNLIKELY(cures))
@@ -103,7 +104,7 @@ static void init_context(int devid)
 
 
 	limit_gpu_mem_if_needed(devid);
 	limit_gpu_mem_if_needed(devid);
 
 
-	cures = cudaStreamCreate(starpu_cuda_get_local_stream());
+	cures = cudaStreamCreate(&streams[workerid]);
 	if (STARPU_UNLIKELY(cures))
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 }

+ 3 - 2
tests/datawizard/acquire_release_cuda.cu

@@ -15,6 +15,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 static __global__ void _increment_cuda_codelet(unsigned *val)
 static __global__ void _increment_cuda_codelet(unsigned *val)
 {
 {
@@ -25,7 +26,7 @@ extern "C" void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_a
 {
 {
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 
-	_increment_cuda_codelet<<<1,1>>>(val);
+	_increment_cuda_codelet<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 3 - 2
tests/datawizard/cuda_codelet_unsigned_inc.cu

@@ -15,6 +15,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 static __global__ void _cuda_unsigned_inc(unsigned *val)
 static __global__ void _cuda_unsigned_inc(unsigned *val)
 {
 {
@@ -25,7 +26,7 @@ extern "C" void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED
 {
 {
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 
-	_cuda_unsigned_inc<<<1,1>>>(val);
+	_cuda_unsigned_inc<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 3 - 2
tests/datawizard/data_invalidation.c

@@ -18,6 +18,7 @@
 #include <unistd.h>
 #include <unistd.h>
 #include <errno.h>
 #include <errno.h>
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 #include <stdlib.h>
 #include <stdlib.h>
 
 
 #define NLOOPS		1000
 #define NLOOPS		1000
@@ -35,8 +36,8 @@ static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_a
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 
-	cudaMemset(buf, 42, length);
-	cudaThreadSynchronize();
+	cudaMemsetAsync(buf, 42, length, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 #endif
 #endif
 
 

+ 3 - 2
tests/datawizard/scratch_cuda.cu

@@ -16,6 +16,7 @@
 
 
 #include <stdio.h>
 #include <stdio.h>
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 #define MAXNBLOCKS		32
 #define MAXNBLOCKS		32
 #define MAXTHREADSPERBLOCK	128
 #define MAXTHREADSPERBLOCK	128
@@ -45,6 +46,6 @@ extern "C" void cuda_f(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 	unsigned nblocks = 128;
 	unsigned nblocks = 128;
 	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
 	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
 	
 	
-	increment_vector<<<nblocks, nthread_per_block>>>(v, tmp, nx);
-	cudaThreadSynchronize();
+	increment_vector<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(v, tmp, nx);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 5 - 4
tests/datawizard/sync_and_notify_data_kernels.cu

@@ -15,6 +15,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 /*
 /*
  *	increment a (val[0])
  *	increment a (val[0])
@@ -29,9 +30,9 @@ extern "C" void cuda_codelet_incA(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 {
 {
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 
-	_cuda_incA<<<1,1>>>(v);
+	_cuda_incA<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 
 
 /*
 /*
@@ -47,7 +48,7 @@ extern "C" void cuda_codelet_incC(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 {
 {
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 
-	_cuda_incC<<<1,1>>>(v);
+	_cuda_incC<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
 
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }