Преглед изворни кода

Enable asynchronous CUDA kernels wherever possible

Samuel Thibault пре 11 година
родитељ
комит
7d2bf0c630
61 измењених фајлова са 117 додато и 138 уклоњено
  1. 2 2
      examples/audio/starpu_audio_processing.c
  2. 2 2
      examples/axpy/axpy.c
  3. 1 0
      examples/basic_examples/multiformat_conversion_codelets.c
  4. 0 2
      examples/basic_examples/multiformat_conversion_codelets_cuda.cu
  5. 9 11
      examples/cg/cg_kernels.c
  6. 0 2
      examples/filters/custom_mf/conversion.cu
  7. 0 2
      examples/filters/custom_mf/cuda.cu
  8. 1 0
      examples/filters/custom_mf/custom_conversion_codelets.c
  9. 1 0
      examples/filters/custom_mf/custom_mf_filter.c
  10. 2 1
      examples/filters/fblock.c
  11. 0 2
      examples/filters/fblock_cuda.cu
  12. 2 2
      examples/filters/shadow.c
  13. 2 2
      examples/filters/shadow2d.c
  14. 2 3
      examples/filters/shadow3d.c
  15. 4 1
      examples/heat/dw_factolu.c
  16. 1 7
      examples/heat/dw_factolu_kernels.c
  17. 1 0
      examples/interface/complex.c
  18. 0 2
      examples/interface/complex_kernels.cu
  19. 5 11
      examples/lu/xlu_kernels.c
  20. 2 2
      examples/mult/xgemm.c
  21. 3 3
      examples/pi/pi_redux.c
  22. 4 3
      examples/reductions/dot_product.c
  23. 0 1
      examples/reductions/dot_product_kernels.cu
  24. 2 1
      examples/spmv/spmv.c
  25. 1 4
      examples/spmv/spmv_cuda.cu
  26. 4 7
      examples/stencil/stencil-kernels.c
  27. 2 1
      tests/datawizard/acquire_release.c
  28. 1 0
      tests/datawizard/acquire_release2.c
  29. 0 2
      tests/datawizard/acquire_release_cuda.cu
  30. 1 3
      tests/datawizard/cuda_codelet_unsigned_inc.cu
  31. 2 2
      tests/datawizard/data_invalidation.c
  32. 1 1
      tests/datawizard/handle_to_pointer.c
  33. 4 4
      tests/datawizard/increment_redux.c
  34. 4 4
      tests/datawizard/increment_redux_lazy.c
  35. 5 4
      tests/datawizard/increment_redux_v2.c
  36. 1 0
      tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets.c
  37. 0 2
      tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu
  38. 2 2
      tests/datawizard/lazy_allocation.c
  39. 2 2
      tests/datawizard/manual_reduction.c
  40. 2 1
      tests/datawizard/mpi_like.c
  41. 2 1
      tests/datawizard/mpi_like_async.c
  42. 2 1
      tests/datawizard/partition_lazy.c
  43. 2 1
      tests/datawizard/scal.c
  44. 1 3
      tests/datawizard/scal_cuda.cu
  45. 2 1
      tests/datawizard/scratch.c
  46. 1 2
      tests/datawizard/scratch_cuda.cu
  47. 1 0
      tests/datawizard/specific_node.c
  48. 3 1
      tests/datawizard/sync_and_notify_data.c
  49. 3 1
      tests/datawizard/sync_and_notify_data_implicit.c
  50. 1 5
      tests/datawizard/sync_and_notify_data_kernels.cu
  51. 2 2
      tests/datawizard/write_only_tmp_buffer.c
  52. 2 2
      tests/datawizard/wt_broadcast.c
  53. 2 2
      tests/datawizard/wt_host.c
  54. 1 5
      tests/main/increment.cu
  55. 2 1
      tests/main/subgraph_repeat.c
  56. 2 1
      tests/main/subgraph_repeat_regenerate.c
  57. 2 1
      tests/main/subgraph_repeat_regenerate_tag.c
  58. 2 1
      tests/main/subgraph_repeat_tag.c
  59. 2 2
      tests/microbenchs/matrix_as_vector.c
  60. 2 2
      tests/perfmodels/non_linear_regression_based.c
  61. 2 2
      tests/perfmodels/regression_based.c

+ 2 - 2
examples/audio/starpu_audio_processing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -211,7 +211,6 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
 	/* FFTW does not normalize its output ! */
 	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -284,6 +283,7 @@ static struct starpu_codelet band_filter_cl =
 	.modes = { STARPU_RW },
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {band_filter_kernel_gpu, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.cpu_funcs = {band_filter_kernel_cpu, NULL},
 	.model = &band_filter_model,

+ 2 - 2
examples/axpy/axpy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -71,7 +71,6 @@ void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -84,6 +83,7 @@ static struct starpu_codelet axpy_cl =
 	.cpu_funcs = {axpy_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {axpy_gpu, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {axpy_opencl, NULL},

+ 1 - 0
examples/basic_examples/multiformat_conversion_codelets.c

@@ -37,6 +37,7 @@ extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
 struct starpu_codelet cpu_to_cuda_cl =
 {
 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1,
 	.name = "codelet_cpu_to_cuda"
 };

+ 0 - 2
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -44,6 +44,4 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 9 - 11
examples/cg/cg_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -73,7 +73,6 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  
 	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -97,6 +96,7 @@ struct starpu_codelet accumulate_variable_cl =
 	.cpu_funcs = {accumulate_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {accumulate_variable_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2,
@@ -111,7 +111,6 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
 	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -136,6 +135,7 @@ struct starpu_codelet accumulate_vector_cl =
 	.cpu_funcs = {accumulate_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {accumulate_vector_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2,
@@ -154,8 +154,6 @@ static void bzero_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	zero_vector(v, 1);
- 
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -177,6 +175,7 @@ struct starpu_codelet bzero_variable_cl =
 	.cpu_funcs = {bzero_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {bzero_variable_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_W},
 	.nbuffers = 1,
@@ -190,8 +189,6 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
 	zero_vector(v, n);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -215,6 +212,7 @@ struct starpu_codelet bzero_vector_cl =
 	.cpu_funcs = {bzero_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {bzero_vector_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_W},
 	.nbuffers = 1,
@@ -322,7 +320,6 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
 	cublasscal(n, alpha, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -350,6 +347,7 @@ static struct starpu_codelet scal_kernel_cl =
 	.cpu_funcs = {scal_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {scal_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 1,
 	.model = &scal_kernel_model
@@ -375,7 +373,6 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 
 	/* Compute v1 = alpha M v2 + beta v1 */
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -425,6 +422,7 @@ static struct starpu_codelet gemv_kernel_cl =
 	.cpu_funcs = {gemv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {gemv_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 3,
 	.model = &gemv_kernel_model
@@ -488,7 +486,6 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	 */
 	cublasscal(n, p1, v1, 1);
 	cublasaxpy(n, p2, v2, 1, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -522,6 +519,7 @@ static struct starpu_codelet scal_axpy_kernel_cl =
 	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {scal_axpy_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.model = &scal_axpy_kernel_model
@@ -565,7 +563,6 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 	/* Compute v1 = v1 + p1 * v2.
 	 */
 	cublasaxpy(n, p1, v2, 1, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -596,6 +593,7 @@ static struct starpu_codelet axpy_kernel_cl =
 	.cpu_funcs = {axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {axpy_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.model = &axpy_kernel_model

+ 0 - 2
examples/filters/custom_mf/conversion.cu

@@ -45,6 +45,4 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 0 - 2
examples/filters/custom_mf/cuda.cu

@@ -39,6 +39,4 @@ extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 0
examples/filters/custom_mf/custom_conversion_codelets.c

@@ -40,6 +40,7 @@ extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
 struct starpu_codelet cpu_to_cuda_cl =
 {
 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.modes = { STARPU_RW },
 	.nbuffers = 1,
 	.name = "codelet_cpu_to_cuda"

+ 1 - 0
examples/filters/custom_mf/custom_mf_filter.c

@@ -158,6 +158,7 @@ static struct starpu_codelet cpu_cl =
 static struct starpu_codelet cuda_cl =
 {
 	.cuda_funcs = { custom_scal_cuda_func, NULL },
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1,
 	.modes = { STARPU_RW },
 	.name = "cuda_codelet"

+ 2 - 1
examples/filters/fblock.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -95,6 +95,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
                 .opencl_funcs = {opencl_func, NULL},

+ 0 - 2
examples/filters/fblock_cuda.cu

@@ -41,6 +41,4 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
         /* TODO: use more blocks and threads in blocks */
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 2
examples/filters/shadow.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -85,7 +85,6 @@ void cuda_func(void *buffers[], void *cl_arg)
 	/* If things go right, sizes should match */
 	STARPU_ASSERT(n == n2);
 	cudaMemcpyAsync(val2, val, n*sizeof(*val), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -103,6 +102,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
                 .nbuffers = 2,
 		.modes = {STARPU_R, STARPU_W}

+ 2 - 2
examples/filters/shadow2d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -139,7 +139,6 @@ void cuda_func(void *buffers[], void *cl_arg)
 	STARPU_ASSERT(n == n2);
 	STARPU_ASSERT(m == m2);
 	cudaMemcpy2DAsync(val2, ld2*sizeof(*val2), val, ld*sizeof(*val), n*sizeof(*val), m, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -157,6 +156,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
                 .nbuffers = 2,
 		.modes = {STARPU_R, STARPU_W}

+ 2 - 3
examples/filters/shadow3d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -105,8 +105,6 @@ void cuda_func(void *buffers[], void *cl_arg)
 				x*sizeof(*val), y, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 		STARPU_ASSERT(!cures);
 	}
-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	STARPU_ASSERT(!cures);
 }
 #endif
 
@@ -124,6 +122,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
                 .nbuffers = 2,
 		.modes = {STARPU_R, STARPU_W}

+ 4 - 1
examples/heat/dw_factolu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -55,6 +55,7 @@ static struct starpu_codelet cl12 =
 	.cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -66,6 +67,7 @@ static struct starpu_codelet cl21 =
 	.cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -77,6 +79,7 @@ static struct starpu_codelet cl22 =
 	.cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},

+ 1 - 7
examples/heat/dw_factolu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -137,8 +137,6 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -202,8 +200,6 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -264,8 +260,6 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:

+ 1 - 0
examples/interface/complex.c

@@ -53,6 +53,7 @@ struct starpu_codelet cl_copy =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {copy_complex_codelet_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {copy_complex_codelet_opencl, NULL},

+ 0 - 2
examples/interface/complex_kernels.cu

@@ -44,6 +44,4 @@ extern "C" void copy_complex_codelet_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 5 - 11
examples/lu/xlu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -70,9 +70,6 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(cures);
-
 			break;
 		}
 #endif
@@ -133,6 +130,7 @@ struct starpu_codelet cl22 =
 	.cpu_funcs = {STARPU_LU(cpu_u22), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_u22), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 3,
@@ -180,9 +178,6 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(cures);
-
 			break;
 #endif
 		default:
@@ -221,6 +216,7 @@ struct starpu_codelet cl12 =
 	.cpu_funcs = {STARPU_LU(cpu_u12), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_u12), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 2,
@@ -266,8 +262,6 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -306,6 +300,7 @@ struct starpu_codelet cl21 =
 	.cpu_funcs = {STARPU_LU(cpu_u21), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_u21), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 2,
@@ -596,8 +591,6 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 				}
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -637,6 +630,7 @@ struct starpu_codelet cl_pivot =
 	.cpu_funcs = {STARPU_LU(cpu_pivot), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_pivot), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 1,

+ 2 - 2
examples/mult/xgemm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -177,7 +177,6 @@ static void mult_kernel_common(void *descr[], int type)
 	{
 		CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
 					     (TYPE)0.0, subC, ldC);
-		cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	}
 #endif
 }
@@ -207,6 +206,7 @@ static struct starpu_codelet cl =
 	.cpu_funcs = {cpu_mult, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cublas_mult, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},

+ 3 - 3
examples/pi/pi_redux.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -252,7 +252,6 @@ static void init_cuda_func(void *descr[], void *cl_arg)
 {
         unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
         cudaMemsetAsync(val, 0, sizeof(unsigned long), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -262,6 +261,7 @@ static struct starpu_codelet init_codelet =
         .cpu_funcs_name = {"init_cpu_func", NULL},
 #ifdef STARPU_HAVE_CURAND
         .cuda_funcs = {init_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_W},
         .nbuffers = 1
@@ -283,7 +283,6 @@ static void redux_cuda_func(void *descr[], void *cl_arg)
 	h_a += h_b;
 
 	cudaMemcpyAsync(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -301,6 +300,7 @@ static struct starpu_codelet redux_codelet =
 	.cpu_funcs_name = {"redux_cpu_func", NULL},
 #ifdef STARPU_HAVE_CURAND
 	.cuda_funcs = {redux_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2

+ 4 - 3
examples/reductions/dot_product.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -75,7 +75,6 @@ void init_cuda_func(void *descr[], void *cl_arg)
 {
 	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -111,6 +110,7 @@ static struct starpu_codelet init_codelet =
 	.cpu_funcs_name = {"init_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {init_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {init_opencl_func, NULL},
@@ -194,6 +194,7 @@ static struct starpu_codelet redux_codelet =
 	.cpu_funcs_name = {"redux_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_func, NULL},
@@ -247,7 +248,6 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 	current_dot += local_dot;
 
 	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -313,6 +313,7 @@ static struct starpu_codelet dot_codelet =
 	.cpu_funcs_name = {"dot_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dot_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {dot_opencl_func, NULL},

+ 0 - 1
examples/reductions/dot_product_kernels.cu

@@ -31,5 +31,4 @@ extern "C" void redux_cuda_func(void *descr[], void *_args)
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 1
examples/spmv/spmv.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -98,6 +98,7 @@ static struct starpu_codelet spmv_cl =
 	.cpu_funcs = {spmv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {spmv_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
         .opencl_funcs = {spmv_kernel_opencl, NULL},

+ 1 - 4
examples/spmv/spmv_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -98,9 +98,6 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 }
 
 

+ 4 - 7
examples/stencil/stencil-kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -264,10 +264,6 @@ static void update_func_cuda(void *descr[], void *arg)
 #endif /* LIFE */
 	}
 
-	cudaError_t cures;
-	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
-		STARPU_CUDA_REPORT_ERROR(cures);
-
 	if (block->bz == 0)
 		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
@@ -465,6 +461,7 @@ struct starpu_codelet cl_update =
 	.cpu_funcs_name = {"update_func_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {update_func_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {update_func_opencl, NULL},
@@ -591,7 +588,6 @@ static void dummy_func_top_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg
 
 	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 /* bottom save, CUDA version */
@@ -605,7 +601,6 @@ static void dummy_func_bottom_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *
 
 	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif /* STARPU_USE_CUDA */
 
@@ -667,6 +662,7 @@ struct starpu_codelet save_cl_bottom =
 	.cpu_funcs_name = {"dummy_func_bottom_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {dummy_func_bottom_opencl, NULL},
@@ -682,6 +678,7 @@ struct starpu_codelet save_cl_top =
 	.cpu_funcs_name = {"dummy_func_top_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dummy_func_top_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {dummy_func_top_opencl, NULL},

+ 2 - 1
tests/datawizard/acquire_release.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,6 +44,7 @@ static struct starpu_codelet increment_cl =
 	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl, NULL},

+ 1 - 0
tests/datawizard/acquire_release2.c

@@ -44,6 +44,7 @@ static struct starpu_codelet increment_cl =
 	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl, NULL},

+ 0 - 2
tests/datawizard/acquire_release_cuda.cu

@@ -26,6 +26,4 @@ extern "C" void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_a
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	_increment_cuda_codelet<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 3
tests/datawizard/cuda_codelet_unsigned_inc.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -28,6 +28,4 @@ extern "C" void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	_cuda_unsigned_inc<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 2
tests/datawizard/data_invalidation.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -45,7 +45,6 @@ static void cuda_memset_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_ar
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(buf, 42, length, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -93,6 +92,7 @@ static struct starpu_codelet memset_cl =
 	.cpu_funcs = {cpu_memset_codelet, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_memset_codelet, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_memset_codelet, NULL},

+ 1 - 1
tests/datawizard/handle_to_pointer.c

@@ -50,7 +50,6 @@ static void cuda_task(void **buffers, void *args)
 	{
 		cudaMemcpyAsync(&numbers[i], &i, sizeof(int), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	}
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -87,6 +86,7 @@ static struct starpu_codelet cl =
 	.cpu_funcs = {cpu_task, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_task, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_task, NULL},

+ 4 - 4
tests/datawizard/increment_redux.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,7 +44,6 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	host_dst += host_src;
 
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -56,7 +55,6 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -119,6 +117,7 @@ static struct starpu_codelet redux_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_kernel, NULL},
@@ -133,6 +132,7 @@ static struct starpu_codelet neutral_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {neutral_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {neutral_opencl_kernel, NULL},
@@ -182,7 +182,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -198,6 +197,7 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},

+ 4 - 4
tests/datawizard/increment_redux_lazy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -40,7 +40,6 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	host_dst += host_src;
 
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -50,7 +49,6 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -105,6 +103,7 @@ static struct starpu_codelet redux_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_kernel, NULL},
@@ -119,6 +118,7 @@ static struct starpu_codelet neutral_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {neutral_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {neutral_opencl_kernel, NULL},
@@ -164,7 +164,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -178,6 +177,7 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},

+ 5 - 4
tests/datawizard/increment_redux_v2.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2013  Université de Bordeaux 1
+ * Copyright (C) 2011-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -43,7 +43,6 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	host_dst += host_src;
 
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -55,7 +54,6 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -118,6 +116,7 @@ static struct starpu_codelet redux_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_kernel, NULL},
@@ -132,6 +131,7 @@ static struct starpu_codelet neutral_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {neutral_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {neutral_opencl_kernel, NULL},
@@ -181,7 +181,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -197,6 +196,7 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},
@@ -211,6 +211,7 @@ struct starpu_codelet increment_cl_redux =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},

+ 1 - 0
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets.c

@@ -38,6 +38,7 @@ void cuda_to_cpu(void *buffers[], void *arg)
 extern void cpu_to_cuda_cuda_func(void *buffers[], void *args); struct starpu_codelet cpu_to_cuda_cl =
 {
 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1
 };
 

+ 0 - 2
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu

@@ -45,6 +45,4 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 2
tests/datawizard/lazy_allocation.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012       inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -40,7 +40,6 @@ static void cuda_memset_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_ar
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(buf, 42, length, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -88,6 +87,7 @@ static struct starpu_codelet memset_cl =
 	.cpu_funcs = {cpu_memset_codelet, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_memset_codelet, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_memset_codelet, NULL},

+ 2 - 2
tests/datawizard/manual_reduction.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -117,7 +117,6 @@ static void cuda_func_incr(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	h_val++;
 	cudaMemcpyAsync(val, &h_val, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -146,6 +145,7 @@ static struct starpu_codelet use_data_on_worker_codelet =
 	.cpu_funcs = {cpu_func_incr, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_func_incr, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_func_incr, NULL},

+ 2 - 1
tests/datawizard/mpi_like.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -60,6 +60,7 @@ static struct starpu_codelet increment_handle_cl =
 	.cpu_funcs = {increment_handle_cpu_kernel, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_unsigned_inc, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet_unsigned_inc, NULL},

+ 2 - 1
tests/datawizard/mpi_like_async.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -88,6 +88,7 @@ static struct starpu_codelet increment_handle_cl =
 	.cpu_funcs = {increment_handle_cpu_kernel, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_unsigned_inc, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = { opencl_codelet_unsigned_inc, NULL},

+ 2 - 1
tests/datawizard/partition_lazy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,6 +26,7 @@ struct starpu_codelet mycodelet =
 #endif
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = { scal_func_cuda, NULL },
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.cpu_funcs_name = {"scal_func_cpu", NULL},
 	.modes = { STARPU_W },

+ 2 - 1
tests/datawizard/scal.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -95,6 +95,7 @@ struct starpu_codelet scal_codelet =
 #endif
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = { scal_func_cuda, NULL },
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.cpu_funcs_name = {"scal_func_cpu", NULL},
 	.modes = { STARPU_RW },

+ 1 - 3
tests/datawizard/scal_cuda.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,6 +33,4 @@ extern "C" void scal_func_cuda(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 1
tests/datawizard/scratch.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -67,6 +67,7 @@ static struct starpu_codelet cl_f =
 	.cpu_funcs = {cpu_f, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_f, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_f, NULL},

+ 1 - 2
tests/datawizard/scratch_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -47,5 +47,4 @@ extern "C" void cuda_f(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
 	
 	increment_vector<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(v, tmp, nx);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 0
tests/datawizard/specific_node.c

@@ -60,6 +60,7 @@ static struct starpu_codelet cl =
 	.cpu_funcs = {specific_kernel, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_unsigned_inc, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet_unsigned_inc, NULL},

+ 3 - 1
tests/datawizard/sync_and_notify_data.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  *
@@ -105,6 +105,7 @@ int main(int argc, char **argv)
 				.cpu_funcs = {cpu_codelet_incA, NULL},
 #ifdef STARPU_USE_CUDA
 				.cuda_funcs = {cuda_codelet_incA, NULL},
+				.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 				.opencl_funcs = {opencl_codelet_incA, NULL},
@@ -142,6 +143,7 @@ int main(int argc, char **argv)
 				.cpu_funcs = {cpu_codelet_incC, NULL},
 #ifdef STARPU_USE_CUDA
 				.cuda_funcs = {cuda_codelet_incC, NULL},
+				.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 				.opencl_funcs = {opencl_codelet_incC, NULL},

+ 3 - 1
tests/datawizard/sync_and_notify_data_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -78,6 +78,7 @@ static struct starpu_codelet cl_inc_a =
 	.cpu_funcs = {cpu_codelet_incA, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_incA, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet_incA, NULL},
@@ -93,6 +94,7 @@ struct starpu_codelet cl_inc_c =
 	.cpu_funcs = {cpu_codelet_incC, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_incC, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet_incC, NULL},

+ 1 - 5
tests/datawizard/sync_and_notify_data_kernels.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,8 +31,6 @@ extern "C" void cuda_codelet_incA(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	_cuda_incA<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 /*
@@ -49,6 +47,4 @@ extern "C" void cuda_codelet_incC(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	_cuda_incC<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 2
tests/datawizard/write_only_tmp_buffer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -52,7 +52,6 @@ static void cuda_codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cudaMemsetAsync(buf, 42, 1, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -80,6 +79,7 @@ static struct starpu_codelet cl =
 	.cpu_funcs = {cpu_codelet_null, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_null, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet_null, NULL},

+ 2 - 2
tests/datawizard/wt_broadcast.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,7 +55,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -69,6 +68,7 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},

+ 2 - 2
tests/datawizard/wt_host.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -59,7 +59,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -75,6 +74,7 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},

+ 1 - 5
tests/main/increment.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013  Université de Bordeaux 1
+ * Copyright (C) 2013-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,10 +25,6 @@ extern "C" void cuda_host_increment(void *descr[], void *_args)
 {
 	(void)_args;
 	unsigned *var = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	cudaError_t cures;
 
 	cuda_increment<<<1,1, 0, starpu_cuda_get_local_stream()>>>(var);
-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	if (cures != cudaSuccess)
-		STARPU_CUDA_REPORT_ERROR(cures);
 }

+ 2 - 1
tests/main/subgraph_repeat.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -62,6 +62,7 @@ static struct starpu_codelet dummy_codelet =
 	.cpu_funcs = {cpu_increment, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_host_increment, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	// TODO
 	//.opencl_funcs = {dummy_func, NULL},

+ 2 - 1
tests/main/subgraph_repeat_regenerate.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -62,6 +62,7 @@ static struct starpu_codelet dummy_codelet =
 	.cpu_funcs = {cpu_increment, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_host_increment, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	// TODO
 	//.opencl_funcs = {dummy_func, NULL},

+ 2 - 1
tests/main/subgraph_repeat_regenerate_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -68,6 +68,7 @@ static struct starpu_codelet dummy_codelet =
 	.cpu_funcs = {cpu_increment, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_host_increment, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	// TODO
 	//.opencl_funcs = {dummy_func, NULL},

+ 2 - 1
tests/main/subgraph_repeat_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -61,6 +61,7 @@ static struct starpu_codelet dummy_codelet =
 	.cpu_funcs = {cpu_increment, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_host_increment, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	// TODO
 	//.opencl_funcs = {dummy_func, NULL},

+ 2 - 2
tests/microbenchs/matrix_as_vector.c

@@ -50,7 +50,6 @@ void vector_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	sum /= nx;
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif /* STARPU_USE_CUDA */
 }
 
@@ -82,7 +81,6 @@ void matrix_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	sum /= nx*ny;
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif /* STARPU_USE_CUDA */
 }
 
@@ -180,6 +178,7 @@ int check_size_on_device(uint32_t where, char *device_name)
 	vector_codelet.nbuffers = 1;
 	if (where == STARPU_CPU) vector_codelet.cpu_funcs[0] = vector_cpu_func;
 	if (where == STARPU_CUDA) vector_codelet.cuda_funcs[0] = vector_cuda_func;
+	if (where == STARPU_CUDA) vector_codelet.cuda_flags[0] = STARPU_CUDA_ASYNC;
 //	if (where == STARPU_OPENCL) vector_codelet.opencl_funcs[0] = vector_opencl_func;
 
 	starpu_codelet_init(&matrix_codelet);
@@ -187,6 +186,7 @@ int check_size_on_device(uint32_t where, char *device_name)
 	matrix_codelet.nbuffers = 1;
 	if (where == STARPU_CPU) matrix_codelet.cpu_funcs[0] = matrix_cpu_func;
 	if (where == STARPU_CUDA) matrix_codelet.cuda_funcs[0] = matrix_cuda_func;
+	if (where == STARPU_CUDA) vector_codelet.cuda_flags[0] = STARPU_CUDA_ASYNC;
 //	if (where == STARPU_OPENCL) matrix_codelet.opencl_funcs[0] = matrix_opencl_func;
 
 	for(nx=NX_MIN ; nx<=NX_MAX ; nx*=2)

+ 2 - 2
tests/perfmodels/non_linear_regression_based.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  *
@@ -29,7 +29,6 @@ static void memset_cuda(void *descr[], void *arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -57,6 +56,7 @@ static struct starpu_codelet memset_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {memset_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {memset_opencl, NULL},

+ 2 - 2
tests/perfmodels/regression_based.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2013  Université de Bordeaux 1
+ * Copyright (C) 2011-2014  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2012 inria
  *
@@ -30,7 +30,6 @@ static void memset_cuda(void *descr[], void *arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -64,6 +63,7 @@ static struct starpu_codelet memset_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {memset_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {memset_opencl, NULL},