Browse Source

- Make starpu_cuda_get_local_stream return the stream instead of a pointer to it
- Pass the stream to foo_to_bar_async instead of a pointer to it
- Make use of the local stream whenever possible
- Document the hint

Samuel Thibault 14 years ago
parent
commit
eb10f6b655

+ 18 - 2
doc/starpu.texi

@@ -798,9 +798,9 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
     unsigned threads_per_block = 64;
     unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
-@i{    vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);}
+@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);}
 
-@i{    cudaThreadSynchronize();}
+@i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
 @}
 @end smallexample
 @end cartouche
@@ -1278,6 +1278,7 @@ TODO: improve!
 * Task distribution vs Data transfer::
 * Power-based scheduling::
 * Profiling::
+* CUDA-specific optimizations::
 @end menu
 
 Simply encapsulating application kernels into tasks already permits to
@@ -1384,6 +1385,21 @@ execution did happen on accelerators without penalizing performance with
 the profiling overhead. More details on performance feedback are provided by the
 next chapter.
 
+@node CUDA-specific optimizations
+@section CUDA-specific optimizations
+
+Due to CUDA limitations, StarPU will have a hard time overlapping
+communications and computations if the application does not use a dedicated
+CUDA stream for its computations. StarPU provides one by the use of
+@code{starpu_cuda_get_local_stream()}. For instance:
+
+@example
+func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaStreamSynchronize(starpu_cuda_get_local_stream());
+@end example
+
+Unfortunately, a lot of cuda libraries do not have stream variants of kernels.
+
 @c ---------------------------------------------------------------------
 @c Performance feedback
 @c ---------------------------------------------------------------------

+ 2 - 2
doc/vector_scal_cuda.texi

@@ -19,7 +19,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned threads_per_block = 64;
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
-        vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);
+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
 
-        cudaThreadSynchronize();
+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
 @}

+ 3 - 2
examples/basic_examples/block_cuda.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void cuda_block(float *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float multiplier)
 {
@@ -37,6 +38,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
         unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
         float *multiplier = (float *)_args;
 
-        cuda_block<<<1,1>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
-	cudaThreadSynchronize();
+        cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 1
examples/basic_examples/variable_kernels.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void cuda_variable(float * tab)
 {
@@ -26,5 +27,6 @@ extern "C" void cuda_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
-	cuda_variable<<<1,1>>>(val);
+	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 2
examples/basic_examples/vector_scal_cuda.cu

@@ -19,6 +19,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
@@ -40,7 +41,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
-        vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);
+        vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n, *factor);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 2
examples/filters/fblock_cuda.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void fblock_cuda(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float factor)
 {
@@ -38,7 +39,7 @@ extern "C" void cuda_func(void *buffers[], void *_args)
         unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
 
         /* TODO: use more blocks and threads in blocks */
-        fblock_cuda<<<1,1>>>(block, nx, ny, nz, ldy, ldz, *factor);
+        fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 1
examples/incrementer/incrementer_kernels.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void cuda_incrementer(float * tab)
 {
@@ -29,5 +30,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	(void)_args;
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
-	cuda_incrementer<<<1,1>>>(val);
+	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 1
examples/pi/SobolQRNG/sobol_gpu.cu

@@ -39,6 +39,8 @@
 
 #include "sobol.h"
 #include "sobol_gpu.h"
+#include <starpu.h>
+#include <starpu_cuda.h>
 
 #define k_2powneg32 2.3283064E-10F
 
@@ -164,5 +166,5 @@ void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float
     dimBlock.x = threadsperblock;
 
     // Execute GPU kernel
-    sobolGPU_kernel<<<dimGrid, dimBlock>>>(n_vectors, n_dimensions, d_directions, d_output);
+    sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
 }

+ 5 - 4
examples/pi/pi_kernel.cu

@@ -16,6 +16,7 @@
 
 #include "SobolQRNG/sobol_gpu.h"
 #include "pi.h"
+#include <starpu_cuda.h>
 
 #define MAXNBLOCKS	128
 #define MAXTHREADSPERBLOCK	256
@@ -109,7 +110,7 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 	STARPU_ASSERT(random_numbers);
 	
 	sobolGPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	TYPE *random_numbers_x = &random_numbers[0];
 	TYPE *random_numbers_y = &random_numbers[nx];
@@ -132,14 +133,14 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
-	monte_carlo<<<nblocks, nthread_per_block>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
-	sum_per_block_cnt<<<1, nblocks>>>(per_block_cnt, cnt);
-	cures = cudaThreadSynchronize();
+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);
 

+ 3 - 3
examples/pi/pi_redux_kernel.cu

@@ -113,14 +113,14 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
-	monte_carlo<<<nblocks, nthread_per_block>>>(x, y, n, per_block_cnt);
+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
-	sum_per_block_cnt<<<1, nblocks>>>(per_block_cnt, shot_cnt);
-	cures = cudaThreadSynchronize();
+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);
 

+ 4 - 3
examples/spmv/spmv_cuda.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 #define MIN(a,b)	((a)<(b)?(a):(b))
 
@@ -95,10 +96,10 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 	dim3 dimBlock(8, 1);
 	dim3 dimGrid(512, 1);
 
-	spmv_kernel_3<<<dimGrid, dimBlock>>>(nnz, nrow, nzval, colind, rowptr,
-						firstentry, vecin, nx_in, vecout, nx_out);
+	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
+		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 }
 

+ 8 - 8
examples/starpufft/cudax_kernels.cu

@@ -28,13 +28,13 @@
 \
 	if (n < threads_per_block) { \
 		dim3 dimGrid(n); \
-		func <<<dimGrid, 1>>> args; \
+		func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
 	} else { \
 		dim3 dimGrid(n / threads_per_block); \
 		dim3 dimBlock(threads_per_block); \
-		func <<<dimGrid, dimBlock>>> args; \
+		func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
 	} \
-	cudaThreadSynchronize(); \
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
 
 extern "C" __global__ void
 STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
@@ -83,24 +83,24 @@ STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsign
 	if (n < threads_per_dim) { \
 		if (m < threads_per_dim) { \
 			dim3 dimGrid(n, m); \
-			func <<<dimGrid, 1>>> args; \
+			func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
 		} else { \
 			dim3 dimGrid(1, m / threads_per_dim); \
 			dim3 dimBlock(n, threads_per_dim); \
-			func <<<dimGrid, dimBlock>>> args; \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
 		} \
 	} else {  \
 		if (m < threads_per_dim) { \
 			dim3 dimGrid(n / threads_per_dim, 1); \
 			dim3 dimBlock(threads_per_dim, m); \
-			func <<<dimGrid, dimBlock>>> args; \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
 		} else { \
 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
-			func <<<dimGrid, dimBlock>>> args; \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
 		} \
 	} \
-	cudaThreadSynchronize(); \
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
 
 extern "C" __global__ void
 STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)

+ 1 - 0
examples/starpufft/cudax_kernels.h

@@ -16,6 +16,7 @@
  */
 
 #include <cuComplex.h>
+#include <starpu_cuda.h>
 _externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
 _externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
 _externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);

+ 7 - 3
examples/starpufft/starpufftx1d.c

@@ -72,7 +72,7 @@ STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
 	
 	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 /* fft1:
@@ -97,6 +97,8 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
 	if (!plan->plans[workerid].initialized1) {
 		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
 
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized1 = 1;
@@ -107,7 +109,7 @@ STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
 
 	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 /* fft2:
@@ -132,6 +134,8 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 
 	if (!plan->plans[workerid].initialized2) {
 		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
 
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized2 = 1;
@@ -141,7 +145,7 @@ STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
 	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 7 - 3
examples/starpufft/starpufftx2d.c

@@ -41,7 +41,7 @@ STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
 	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 /* Perform an n2,m2 fft */
@@ -67,6 +67,8 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 
 	if (!plan->plans[workerid].initialized1) {
 		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
 
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized1 = 1;
@@ -78,7 +80,7 @@ STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
 	/* synchronization is done after the twiddling */
 	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void
@@ -104,6 +106,8 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 
 	if (!plan->plans[workerid].initialized2) {
 		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
 
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 		plan->plans[workerid].initialized2 = 1;
@@ -114,7 +118,7 @@ STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
 	}
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 2 - 1
examples/stencil/life_cuda.cu

@@ -16,6 +16,7 @@
 
 #define _externC extern "C"
 #include "stencil.h"
+#include <starpu_cuda.h>
 
 /* Heart of the stencil computation: compute a new state from an old one. */
 
@@ -72,5 +73,5 @@ cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int n
 	dim3 dimBlock(threads_per_dim_x, threads_per_dim_y);
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
-	cuda_life_update <<<dimGrid, dimBlock>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
+	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
 }

+ 2 - 1
examples/stencil/shadow.cu

@@ -16,6 +16,7 @@
 
 #define _externC extern "C"
 #include "stencil.h"
+#include <starpu_cuda.h>
 
 /* Perform replication of data on X and Y edges, to fold the domain on 
    itself through mere replication of the source state. */
@@ -54,5 +55,5 @@ cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz, in
 	dim3 dimBlock(threads_per_dim_x, threads_per_dim_y);
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
-	cuda_shadow <<<dimGrid, dimBlock>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
+	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
 }

+ 6 - 6
examples/stencil/stencil-kernels.c

@@ -173,7 +173,7 @@ static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
 	unsigned offset = firstz*block->ldz;
 	TYPE *block_data = (TYPE *)block->ptr;
 	TYPE *boundary_data = (TYPE *)boundary->ptr;
-	cudaMemcpy(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice);
+	cudaMemcpyAsync(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 }
 
 /*
@@ -243,12 +243,12 @@ fprintf(stderr,"!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid)
 #ifdef LIFE
 		cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
 #else
-		cudaMemcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice);
+		cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 #endif /* LIFE */
 	}
 
 	cudaError_t cures;
-	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
+	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 }
@@ -407,7 +407,7 @@ static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
 	unsigned offset = firstz*block->ldz;
 	TYPE *block_data = (TYPE *)block->ptr;
 	TYPE *boundary_data = (TYPE *)boundary->ptr;
-	cudaMemcpy(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice);
+	cudaMemcpyAsync(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 }
 #endif /* STARPU_USE_CUDA */
 
@@ -459,7 +459,7 @@ static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg
 
 	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 /* bottom save, CUDA version */
@@ -473,7 +473,7 @@ static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *
 
 	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif /* STARPU_USE_CUDA */
 

+ 1 - 1
include/starpu_cuda.h

@@ -79,7 +79,7 @@ extern "C" {
 		STARPU_CUBLAS_OOPS();					\
 	} while (0)
 
-cudaStream_t *starpu_cuda_get_local_stream(void);
+cudaStream_t starpu_cuda_get_local_stream(void);
 
 #ifdef __cplusplus
 }

+ 3 - 3
include/starpu_data_interfaces.h

@@ -66,9 +66,9 @@ struct starpu_data_copy_methods {
 
 #ifdef STARPU_USE_CUDA
 	/* for asynchronous CUDA transfers */
-	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
-	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
-	int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
+	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+	int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 #endif
 
 #ifdef STARPU_USE_OPENCL

+ 3 - 1
mpi/tests/ring_kernel.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void cuda_incrementer(unsigned *token)
 {
@@ -26,5 +27,6 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 	(void) _args;
 	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
-	cuda_incrementer<<<1,1>>>(tokenptr);
+	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 3
src/datawizard/copy_driver.c

@@ -102,7 +102,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
 #ifdef STARPU_USE_CUDA
 	cudaError_t cures;
-	cudaStream_t *stream;
+	cudaStream_t stream;
 #endif
 
 	void *src_interface = src_replicate->interface;
@@ -132,7 +132,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 				stream = starpu_cuda_get_local_stream();
 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 
-				cures = cudaEventRecord(req->async_channel.cuda_event, *stream);
+				cures = cudaEventRecord(req->async_channel.cuda_event, stream);
 				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 			}
 		}
@@ -157,7 +157,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 			stream = starpu_cuda_get_local_stream();
 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
-			cures = cudaEventRecord(req->async_channel.cuda_event, *stream);
+			cures = cudaEventRecord(req->async_channel.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 		}
 		break;

+ 8 - 8
src/datawizard/interfaces/block_interface.c

@@ -31,8 +31,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
@@ -435,7 +435,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream, enum cudaMemcpyKind kind)
+static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -457,7 +457,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, kind, *stream);
+					nx*ny*nz*elemsize, kind, stream);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
@@ -476,7 +476,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 			/* Are all plans contiguous */
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
-					nx*ny*elemsize, nz, kind, *stream);
+					nx*ny*elemsize, nz, kind, stream);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
@@ -502,7 +502,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
                                                   (char *)src_ptr, src_block->ldy*elemsize,
-                                                  nx*elemsize, ny, kind, *stream);
+                                                  nx*elemsize, ny, kind, stream);
 
 			if (STARPU_UNLIKELY(cures))
 			{
@@ -557,12 +557,12 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }

+ 6 - 6
src/datawizard/interfaces/matrix_interface.c

@@ -30,8 +30,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
@@ -385,7 +385,7 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
@@ -396,7 +396,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
 			(size_t)src_matrix->nx*elemsize, src_matrix->ny,
-			cudaMemcpyDeviceToHost, *stream);
+			cudaMemcpyDeviceToHost, stream);
 	if (cures)
 	{
 		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
@@ -415,7 +415,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 	return -EAGAIN;
 }
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
@@ -426,7 +426,7 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 				src_matrix->nx*elemsize, src_matrix->ny,
-				cudaMemcpyHostToDevice, *stream);
+				cudaMemcpyHostToDevice, stream);
 	if (cures)
 	{
 		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,

+ 6 - 6
src/datawizard/interfaces/variable_interface.c

@@ -29,8 +29,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_int
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
@@ -305,13 +305,13 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
 					void *dst_interface, unsigned dst_node __attribute__((unused)),
-					cudaStream_t *stream, enum cudaMemcpyKind kind)
+					cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 	starpu_variable_interface_t *src_variable = src_interface;
 	starpu_variable_interface_t *dst_variable = dst_interface;
 
 	cudaError_t cures;
-	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, *stream);
+	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, stream);
 	if (cures)
 	{
 		/* do it in a synchronous fashion */
@@ -330,13 +330,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }

+ 6 - 6
src/datawizard/interfaces/vector_interface.c

@@ -29,8 +29,8 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t *stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t *stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
@@ -340,13 +340,13 @@ static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute_
 
 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
 					void *dst_interface, unsigned dst_node __attribute__((unused)),
-					cudaStream_t *stream, enum cudaMemcpyKind kind)
+					cudaStream_t stream, enum cudaMemcpyKind kind)
 {
 	starpu_vector_interface_t *src_vector = src_interface;
 	starpu_vector_interface_t *dst_vector = dst_interface;
 
 	cudaError_t cures;
-	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, *stream);
+	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
 	if (cures)
 	{
 		/* do it in a synchronous fashion */
@@ -364,13 +364,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
 }
 
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }

+ 2 - 2
src/datawizard/interfaces/void_interface.c

@@ -26,7 +26,7 @@
 
 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA
-static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t *stream);
+static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
 static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event);
@@ -141,7 +141,7 @@ static int dummy_cuda_copy_async(void *src_interface __attribute__((unused)),
 				unsigned src_node __attribute__((unused)),
 				void *dst_interface __attribute__((unused)),
 				unsigned dst_node __attribute__((unused)),
-				cudaStream_t *stream __attribute__ ((unused)))
+				cudaStream_t stream __attribute__ ((unused)))
 {
 	return 0;
 }

+ 4 - 3
src/drivers/cuda/driver_cuda.c

@@ -83,16 +83,17 @@ static void unlimit_gpu_mem_if_needed(int devid)
 	}
 }
 
-cudaStream_t *starpu_cuda_get_local_stream(void)
+cudaStream_t starpu_cuda_get_local_stream(void)
 {
 	int worker = starpu_worker_get_id();
 
-	return &streams[worker];
+	return streams[worker];
 }
 
 static void init_context(int devid)
 {
 	cudaError_t cures;
+	int workerid = starpu_worker_get_id();
 
 	cures = cudaSetDevice(devid);
 	if (STARPU_UNLIKELY(cures))
@@ -103,7 +104,7 @@ static void init_context(int devid)
 
 	limit_gpu_mem_if_needed(devid);
 
-	cures = cudaStreamCreate(starpu_cuda_get_local_stream());
+	cures = cudaStreamCreate(&streams[workerid]);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 }

+ 3 - 2
tests/datawizard/acquire_release_cuda.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void _increment_cuda_codelet(unsigned *val)
 {
@@ -25,7 +26,7 @@ extern "C" void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_a
 {
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
-	_increment_cuda_codelet<<<1,1>>>(val);
+	_increment_cuda_codelet<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 2
tests/datawizard/cuda_codelet_unsigned_inc.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void _cuda_unsigned_inc(unsigned *val)
 {
@@ -25,7 +26,7 @@ extern "C" void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED
 {
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
-	_cuda_unsigned_inc<<<1,1>>>(val);
+	_cuda_unsigned_inc<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 2
tests/datawizard/data_invalidation.c

@@ -18,6 +18,7 @@
 #include <unistd.h>
 #include <errno.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 #include <stdlib.h>
 
 #define NLOOPS		1000
@@ -35,8 +36,8 @@ static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_a
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
-	cudaMemset(buf, 42, length);
-	cudaThreadSynchronize();
+	cudaMemsetAsync(buf, 42, length, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 3 - 2
tests/datawizard/scratch_cuda.cu

@@ -16,6 +16,7 @@
 
 #include <stdio.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 #define MAXNBLOCKS		32
 #define MAXTHREADSPERBLOCK	128
@@ -45,6 +46,6 @@ extern "C" void cuda_f(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 	unsigned nblocks = 128;
 	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
 	
-	increment_vector<<<nblocks, nthread_per_block>>>(v, tmp, nx);
-	cudaThreadSynchronize();
+	increment_vector<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(v, tmp, nx);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 5 - 4
tests/datawizard/sync_and_notify_data_kernels.cu

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 /*
  *	increment a (val[0])
@@ -29,9 +30,9 @@ extern "C" void cuda_codelet_incA(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 {
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
-	_cuda_incA<<<1,1>>>(v);
+	_cuda_incA<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 /*
@@ -47,7 +48,7 @@ extern "C" void cuda_codelet_incC(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 {
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
-	_cuda_incC<<<1,1>>>(v);
+	_cuda_incC<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
 
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }