13 년 전 · 57e59bf2d9
--- a/examples/filters/shadow.c
+++ b/examples/filters/shadow.c
@@ -85,7 +85,7 @@ void cuda_func(void *buffers[], void *cl_arg)
 
				 
			
 
				 	/* If things go right, sizes should match */
			
 
				 	STARPU_ASSERT(n == n2);
			
 
				-	cudaMemcpy(val2, val, n*sizeof(*val), cudaMemcpyDeviceToDevice);
			
 
				+	cudaMemcpyAsync(val2, val, n*sizeof(*val), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
--- a/examples/filters/shadow2d.c
+++ b/examples/filters/shadow2d.c
@@ -139,7 +139,7 @@ void cuda_func(void *buffers[], void *cl_arg)
 
				 	/* If things go right, sizes should match */
			
 
				 	STARPU_ASSERT(n == n2);
			
 
				 	STARPU_ASSERT(m == m2);
			
 
				-	cudaMemcpy2D(val2, ld2*sizeof(*val2), val, ld*sizeof(*val), n*sizeof(*val), m, cudaMemcpyDeviceToDevice);
			
 
				+	cudaMemcpy2DAsync(val2, ld2*sizeof(*val2), val, ld*sizeof(*val), n*sizeof(*val), m, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
--- a/examples/heat/dw_factolu.h
+++ b/examples/heat/dw_factolu.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -28,6 +28,7 @@
 
				 #include <cuda.h>
			
 
				 #include <cuda_runtime.h>
			
 
				 #include <cublas.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 #endif
			
 
				 
			
 
				 #include "../common/blas.h"
			
--- a/examples/heat/dw_factolu_kernels.c
+++ b/examples/heat/dw_factolu_kernels.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -344,8 +344,8 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribut
 
				 			for (z = 0; z < nx; z++)
			
 
				 			{
			
 
				 				float pivot;
			
 
				-				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
			
 
				-				cudaStreamSynchronize(0);
			
 
				+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 				STARPU_ASSERT(pivot != 0.0f);
			
 
				 				
			
--- a/examples/lu/xlu_kernels.c
+++ b/examples/lu/xlu_kernels.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -328,8 +328,8 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 
				 			{
			
 
				 				TYPE pivot;
			
 
				 				TYPE inv_pivot;
			
 
				-				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
			
 
				-				cudaStreamSynchronize(0);
			
 
				+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 				STARPU_ASSERT(fpclassify(pivot) != FP_ZERO);
			
 
				 				
			
@@ -450,8 +450,8 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 
				 			{
			
 
				 				TYPE pivot;
			
 
				 				TYPE inv_pivot;
			
 
				-				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
			
 
				-				cudaStreamSynchronize(0);
			
 
				+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
			
 
				 				{
			
@@ -466,8 +466,8 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 
				 						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
			
 
				 					}
			
 
				 
			
 
				-					cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
			
 
				-					cudaStreamSynchronize(0);
			
 
				+					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+					cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 				}
			
 
				 
			
 
				 				STARPU_ASSERT(pivot != 0.0);
			
--- a/examples/pi/pi_redux.c
+++ b/examples/pi/pi_redux.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -263,12 +263,14 @@ static void redux_cuda_func(void *descr[], void *cl_arg)
 
				 
			
 
				 	unsigned long h_a, h_b;
			
 
				 	
			
 
				-	cudaMemcpy(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost);
			
 
				-	cudaMemcpy(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost);
			
 
				+	cudaMemcpyAsync(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaMemcpyAsync(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	h_a += h_b;
			
 
				 
			
 
				-	cudaMemcpy(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice);
			
 
				+	cudaMemcpyAsync(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 };
			
 
				 #endif
			
 
				 
			
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2012 inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -237,20 +237,16 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
				 
			
 
				 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 
			
 
				-	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
			
 
				-
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
			
 
				 
			
 
				 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 	current_dot += local_dot;
			
 
				 
			
 
				-	cudaThreadSynchronize();
			
 
				-
			
 
				-	cudaMemcpy(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice);
			
 
				-
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/mpi/examples/cholesky/mpi_cholesky_kernels.c
+++ b/mpi/examples/cholesky/mpi_cholesky_kernels.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010, 2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -195,8 +195,8 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
				 			for (z = 0; z < nx; z++)
			
 
				 			{
			
 
				 				float lambda11;
			
 
				-				cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
			
 
				-				cudaStreamSynchronize(0);
			
 
				+				cudaMemcpyAsync(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 				STARPU_ASSERT(lambda11 != 0.0f);
			
 
				 
			
--- a/mpi/examples/mpi_lu/pxlu_kernels.c
+++ b/mpi/examples/mpi_lu/pxlu_kernels.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -381,8 +381,8 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 
				 			for (z = 0; z < nx; z++)
			
 
				 			{
			
 
				 				TYPE pivot;
			
 
				-				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
			
 
				-				cudaStreamSynchronize(0);
			
 
				+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 				STARPU_ASSERT(pivot != 0.0);
			
 
				 				
			
--- a/tests/datawizard/handle_to_pointer.c
+++ b/tests/datawizard/handle_to_pointer.c
@@ -21,6 +21,9 @@
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 #include <starpu_opencl.h>
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+#include <starpu_cuda.h>
			
 
				+#endif
			
 
				 #include <stdlib.h>
			
 
				 #include "../helper.h"
			
 
				 
			
@@ -51,8 +54,9 @@ static void cuda_task(void **buffers, void *args)
 
				 
			
 
				 	for(i = 0; i < size; i++)
			
 
				 	{
			
 
				-		cudaMemcpy(&numbers[i], &i, sizeof(int), cudaMemcpyHostToDevice);
			
 
				+		cudaMemcpyAsync(&numbers[i], &i, sizeof(int), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				 	}
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/tests/datawizard/increment_redux.c
+++ b/tests/datawizard/increment_redux.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -44,14 +44,14 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 
				 	unsigned host_dst, host_src;
			
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				-	cudaMemcpy(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaMemcpy(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaMemcpyAsync(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	host_dst += host_src;
			
 
				 
			
 
				-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 static void neutral_cuda_kernel(void *descr[], void *arg)
			
@@ -62,8 +62,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				 	unsigned host_dst = 0;
			
 
				-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
@@ -183,13 +183,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
				 	unsigned host_token;
			
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				-	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	host_token++;
			
 
				 
			
 
				-	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/tests/datawizard/increment_redux_lazy.c
+++ b/tests/datawizard/increment_redux_lazy.c
@@ -40,14 +40,14 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 
				 	unsigned host_dst, host_src;
			
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				-	cudaMemcpy(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaMemcpy(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaMemcpyAsync(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	host_dst += host_src;
			
 
				 
			
 
				-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 static void neutral_cuda_kernel(void *descr[], void *arg)
			
@@ -56,8 +56,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				 	unsigned host_dst = 0;
			
 
				-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
@@ -165,13 +165,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
				 	unsigned host_token;
			
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				-	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	host_token++;
			
 
				 
			
 
				-	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/tests/datawizard/increment_redux_v2.c
+++ b/tests/datawizard/increment_redux_v2.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011-2012  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -43,14 +43,14 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 
				 	unsigned host_dst, host_src;
			
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				-	cudaMemcpy(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaMemcpy(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaMemcpyAsync(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	host_dst += host_src;
			
 
				 
			
 
				-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
 
				 static void neutral_cuda_kernel(void *descr[], void *arg)
			
@@ -61,8 +61,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				 	unsigned host_dst = 0;
			
 
				-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
@@ -182,13 +182,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
				 	unsigned host_token;
			
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				-	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	host_token++;
			
 
				 
			
 
				-	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/tests/datawizard/lazy_allocation.c
+++ b/tests/datawizard/lazy_allocation.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2012       inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -134,7 +134,8 @@ static void cuda_check_content_codelet(void *descr[], __attribute__ ((unused)) v
 
				 	for (i = 0; i < length; i++)
			
 
				 	{
			
 
				 		char dst;
			
 
				-		cudaMemcpy(&dst, &buf[i], sizeof(char), cudaMemcpyDeviceToHost);
			
 
				+		cudaMemcpyAsync(&dst, &buf[i], sizeof(char), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 		if (dst != 42)
			
 
				 		{
			
 
				 			FPRINTF(stderr, "buf[%u] is %c while it should be %c\n", i, dst, 42);
			
--- a/tests/datawizard/manual_reduction.c
+++ b/tests/datawizard/manual_reduction.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -21,6 +21,7 @@
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #include <cuda.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 #include <starpu_opencl.h>
			
@@ -125,9 +126,11 @@ static void cuda_func_incr(void *descr[], void *cl_arg __attribute__((unused)))
 
				 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				 	unsigned h_val;
			
 
				-	cudaMemcpy(&h_val, val, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				+	cudaMemcpyAsync(&h_val, val, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 	h_val++;
			
 
				-	cudaMemcpy(val, &h_val, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				+	cudaMemcpyAsync(val, &h_val, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/tests/datawizard/wt_broadcast.c
+++ b/tests/datawizard/wt_broadcast.c
@@ -60,13 +60,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
				 	unsigned host_token;
			
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				-	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	host_token++;
			
 
				 
			
 
				-	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/tests/datawizard/wt_host.c
+++ b/tests/datawizard/wt_host.c
@@ -60,13 +60,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
				 	unsigned host_token;
			
 
				 
			
 
				 	/* This is a dummy technique of course */
			
 
				-	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
 
				 	host_token++;
			
 
				 
			
 
				-	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/tests/microbenchs/matrix_as_vector.c
+++ b/tests/microbenchs/matrix_as_vector.c
@@ -20,6 +20,7 @@
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #  include <cublas.h>
			
 
				+#  include <starpu_cuda.h>
			
 
				 #endif
			
 
				 
			
 
				 #define LOOPS 100
			
@@ -49,8 +50,8 @@ void vector_cuda_func(void *descr[], void *cl_arg __attribute__((unused)))
 
				 	cudaThreadSynchronize();
			
 
				 	sum /= nx;
			
 
				 
			
 
				-	cudaMemcpy(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 }
			
 
				 
			
@@ -81,8 +82,8 @@ void matrix_cuda_func(void *descr[], void *cl_arg __attribute__((unused)))
 
				 	cudaThreadSynchronize();
			
 
				 	sum /= nx*ny;
			
 
				 
			
 
				-	cudaMemcpy(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 }