Olivier Aumage преди 11 години
родител
ревизия
f4d932acbf
променени са 68 файла, в които са добавени 314 реда и са изтрити 214 реда
  1. 2 2
      examples/audio/starpu_audio_processing.c
  2. 2 2
      examples/axpy/axpy.c
  3. 1 0
      examples/basic_examples/multiformat_conversion_codelets.c
  4. 0 2
      examples/basic_examples/multiformat_conversion_codelets_cuda.cu
  5. 9 11
      examples/cg/cg_kernels.c
  6. 0 2
      examples/filters/custom_mf/conversion.cu
  7. 0 2
      examples/filters/custom_mf/cuda.cu
  8. 1 0
      examples/filters/custom_mf/custom_conversion_codelets.c
  9. 1 0
      examples/filters/custom_mf/custom_mf_filter.c
  10. 2 1
      examples/filters/fblock.c
  11. 0 2
      examples/filters/fblock_cuda.cu
  12. 2 2
      examples/filters/shadow.c
  13. 2 2
      examples/filters/shadow2d.c
  14. 2 3
      examples/filters/shadow3d.c
  15. 4 1
      examples/heat/dw_factolu.c
  16. 1 7
      examples/heat/dw_factolu_kernels.c
  17. 1 0
      examples/interface/complex.c
  18. 0 2
      examples/interface/complex_kernels.cu
  19. 5 11
      examples/lu/xlu_kernels.c
  20. 2 2
      examples/mult/xgemm.c
  21. 3 3
      examples/pi/pi_redux.c
  22. 4 3
      examples/reductions/dot_product.c
  23. 0 1
      examples/reductions/dot_product_kernels.cu
  24. 2 1
      examples/spmv/spmv.c
  25. 1 4
      examples/spmv/spmv_cuda.cu
  26. 4 7
      examples/stencil/stencil-kernels.c
  27. 1 1
      src/core/jobs.h
  28. 20 10
      src/datawizard/data_request.c
  29. 5 5
      src/datawizard/data_request.h
  30. 26 7
      src/datawizard/datawizard.c
  31. 3 2
      src/datawizard/datawizard.h
  32. 69 21
      src/drivers/cuda/driver_cuda.c
  33. 73 30
      src/drivers/opencl/driver_opencl.c
  34. 2 1
      tests/datawizard/acquire_release.c
  35. 1 0
      tests/datawizard/acquire_release2.c
  36. 0 2
      tests/datawizard/acquire_release_cuda.cu
  37. 1 3
      tests/datawizard/cuda_codelet_unsigned_inc.cu
  38. 2 2
      tests/datawizard/data_invalidation.c
  39. 1 1
      tests/datawizard/handle_to_pointer.c
  40. 4 4
      tests/datawizard/increment_redux.c
  41. 4 4
      tests/datawizard/increment_redux_lazy.c
  42. 5 4
      tests/datawizard/increment_redux_v2.c
  43. 1 0
      tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets.c
  44. 0 2
      tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu
  45. 2 2
      tests/datawizard/lazy_allocation.c
  46. 2 2
      tests/datawizard/manual_reduction.c
  47. 2 1
      tests/datawizard/mpi_like.c
  48. 2 1
      tests/datawizard/mpi_like_async.c
  49. 2 1
      tests/datawizard/partition_lazy.c
  50. 2 1
      tests/datawizard/scal.c
  51. 1 3
      tests/datawizard/scal_cuda.cu
  52. 2 1
      tests/datawizard/scratch.c
  53. 1 2
      tests/datawizard/scratch_cuda.cu
  54. 1 0
      tests/datawizard/specific_node.c
  55. 3 1
      tests/datawizard/sync_and_notify_data.c
  56. 3 1
      tests/datawizard/sync_and_notify_data_implicit.c
  57. 1 5
      tests/datawizard/sync_and_notify_data_kernels.cu
  58. 2 2
      tests/datawizard/write_only_tmp_buffer.c
  59. 2 2
      tests/datawizard/wt_broadcast.c
  60. 2 2
      tests/datawizard/wt_host.c
  61. 1 5
      tests/main/increment.cu
  62. 2 1
      tests/main/subgraph_repeat.c
  63. 2 1
      tests/main/subgraph_repeat_regenerate.c
  64. 2 1
      tests/main/subgraph_repeat_regenerate_tag.c
  65. 2 1
      tests/main/subgraph_repeat_tag.c
  66. 2 2
      tests/microbenchs/matrix_as_vector.c
  67. 2 2
      tests/perfmodels/non_linear_regression_based.c
  68. 2 2
      tests/perfmodels/regression_based.c

+ 2 - 2
examples/audio/starpu_audio_processing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -211,7 +211,6 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
 	/* FFTW does not normalize its output ! */
 	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -284,6 +283,7 @@ static struct starpu_codelet band_filter_cl =
 	.modes = { STARPU_RW },
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {band_filter_kernel_gpu, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.cpu_funcs = {band_filter_kernel_cpu, NULL},
 	.model = &band_filter_model,

+ 2 - 2
examples/axpy/axpy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -71,7 +71,6 @@ void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -84,6 +83,7 @@ static struct starpu_codelet axpy_cl =
 	.cpu_funcs = {axpy_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {axpy_gpu, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {axpy_opencl, NULL},

+ 1 - 0
examples/basic_examples/multiformat_conversion_codelets.c

@@ -37,6 +37,7 @@ extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
 struct starpu_codelet cpu_to_cuda_cl =
 {
 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1,
 	.name = "codelet_cpu_to_cuda"
 };

+ 0 - 2
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -44,6 +44,4 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 9 - 11
examples/cg/cg_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -73,7 +73,6 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  
 	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -97,6 +96,7 @@ struct starpu_codelet accumulate_variable_cl =
 	.cpu_funcs = {accumulate_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {accumulate_variable_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2,
@@ -111,7 +111,6 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
 	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -136,6 +135,7 @@ struct starpu_codelet accumulate_vector_cl =
 	.cpu_funcs = {accumulate_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {accumulate_vector_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2,
@@ -154,8 +154,6 @@ static void bzero_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	zero_vector(v, 1);
- 
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -177,6 +175,7 @@ struct starpu_codelet bzero_variable_cl =
 	.cpu_funcs = {bzero_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {bzero_variable_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_W},
 	.nbuffers = 1,
@@ -190,8 +189,6 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
 	zero_vector(v, n);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -215,6 +212,7 @@ struct starpu_codelet bzero_vector_cl =
 	.cpu_funcs = {bzero_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {bzero_vector_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_W},
 	.nbuffers = 1,
@@ -322,7 +320,6 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
 	cublasscal(n, alpha, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -350,6 +347,7 @@ static struct starpu_codelet scal_kernel_cl =
 	.cpu_funcs = {scal_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {scal_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 1,
 	.model = &scal_kernel_model
@@ -375,7 +373,6 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 
 	/* Compute v1 = alpha M v2 + beta v1 */
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -425,6 +422,7 @@ static struct starpu_codelet gemv_kernel_cl =
 	.cpu_funcs = {gemv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {gemv_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 3,
 	.model = &gemv_kernel_model
@@ -488,7 +486,6 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	 */
 	cublasscal(n, p1, v1, 1);
 	cublasaxpy(n, p2, v2, 1, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -522,6 +519,7 @@ static struct starpu_codelet scal_axpy_kernel_cl =
 	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {scal_axpy_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.model = &scal_axpy_kernel_model
@@ -565,7 +563,6 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 	/* Compute v1 = v1 + p1 * v2.
 	 */
 	cublasaxpy(n, p1, v2, 1, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -596,6 +593,7 @@ static struct starpu_codelet axpy_kernel_cl =
 	.cpu_funcs = {axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {axpy_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.model = &axpy_kernel_model

+ 0 - 2
examples/filters/custom_mf/conversion.cu

@@ -45,6 +45,4 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 0 - 2
examples/filters/custom_mf/cuda.cu

@@ -39,6 +39,4 @@ extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 0
examples/filters/custom_mf/custom_conversion_codelets.c

@@ -40,6 +40,7 @@ extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
 struct starpu_codelet cpu_to_cuda_cl =
 {
 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.modes = { STARPU_RW },
 	.nbuffers = 1,
 	.name = "codelet_cpu_to_cuda"

+ 1 - 0
examples/filters/custom_mf/custom_mf_filter.c

@@ -158,6 +158,7 @@ static struct starpu_codelet cpu_cl =
 static struct starpu_codelet cuda_cl =
 {
 	.cuda_funcs = { custom_scal_cuda_func, NULL },
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1,
 	.modes = { STARPU_RW },
 	.name = "cuda_codelet"

+ 2 - 1
examples/filters/fblock.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -95,6 +95,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
                 .opencl_funcs = {opencl_func, NULL},

+ 0 - 2
examples/filters/fblock_cuda.cu

@@ -41,6 +41,4 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
         /* TODO: use more blocks and threads in blocks */
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 2
examples/filters/shadow.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -85,7 +85,6 @@ void cuda_func(void *buffers[], void *cl_arg)
 	/* If things go right, sizes should match */
 	STARPU_ASSERT(n == n2);
 	cudaMemcpyAsync(val2, val, n*sizeof(*val), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -103,6 +102,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
                 .nbuffers = 2,
 		.modes = {STARPU_R, STARPU_W}

+ 2 - 2
examples/filters/shadow2d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -139,7 +139,6 @@ void cuda_func(void *buffers[], void *cl_arg)
 	STARPU_ASSERT(n == n2);
 	STARPU_ASSERT(m == m2);
 	cudaMemcpy2DAsync(val2, ld2*sizeof(*val2), val, ld*sizeof(*val), n*sizeof(*val), m, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -157,6 +156,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
                 .nbuffers = 2,
 		.modes = {STARPU_R, STARPU_W}

+ 2 - 3
examples/filters/shadow3d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -105,8 +105,6 @@ void cuda_func(void *buffers[], void *cl_arg)
 				x*sizeof(*val), y, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 		STARPU_ASSERT(!cures);
 	}
-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	STARPU_ASSERT(!cures);
 }
 #endif
 
@@ -124,6 +122,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
                 .nbuffers = 2,
 		.modes = {STARPU_R, STARPU_W}

+ 4 - 1
examples/heat/dw_factolu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -55,6 +55,7 @@ static struct starpu_codelet cl12 =
 	.cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -66,6 +67,7 @@ static struct starpu_codelet cl21 =
 	.cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -77,6 +79,7 @@ static struct starpu_codelet cl22 =
 	.cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},

+ 1 - 7
examples/heat/dw_factolu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -137,8 +137,6 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -202,8 +200,6 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -264,8 +260,6 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:

+ 1 - 0
examples/interface/complex.c

@@ -53,6 +53,7 @@ struct starpu_codelet cl_copy =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {copy_complex_codelet_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {copy_complex_codelet_opencl, NULL},

+ 0 - 2
examples/interface/complex_kernels.cu

@@ -44,6 +44,4 @@ extern "C" void copy_complex_codelet_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 5 - 11
examples/lu/xlu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -70,9 +70,6 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(cures);
-
 			break;
 		}
 #endif
@@ -133,6 +130,7 @@ struct starpu_codelet cl22 =
 	.cpu_funcs = {STARPU_LU(cpu_u22), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_u22), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 3,
@@ -180,9 +178,6 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(cures);
-
 			break;
 #endif
 		default:
@@ -221,6 +216,7 @@ struct starpu_codelet cl12 =
 	.cpu_funcs = {STARPU_LU(cpu_u12), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_u12), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 2,
@@ -266,8 +262,6 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -306,6 +300,7 @@ struct starpu_codelet cl21 =
 	.cpu_funcs = {STARPU_LU(cpu_u21), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_u21), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 2,
@@ -596,8 +591,6 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 				}
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -637,6 +630,7 @@ struct starpu_codelet cl_pivot =
 	.cpu_funcs = {STARPU_LU(cpu_pivot), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_pivot), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 1,

+ 2 - 2
examples/mult/xgemm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -177,7 +177,6 @@ static void mult_kernel_common(void *descr[], int type)
 	{
 		CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
 					     (TYPE)0.0, subC, ldC);
-		cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	}
 #endif
 }
@@ -207,6 +206,7 @@ static struct starpu_codelet cl =
 	.cpu_funcs = {cpu_mult, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cublas_mult, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},

+ 3 - 3
examples/pi/pi_redux.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -252,7 +252,6 @@ static void init_cuda_func(void *descr[], void *cl_arg)
 {
         unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
         cudaMemsetAsync(val, 0, sizeof(unsigned long), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -262,6 +261,7 @@ static struct starpu_codelet init_codelet =
         .cpu_funcs_name = {"init_cpu_func", NULL},
 #ifdef STARPU_HAVE_CURAND
         .cuda_funcs = {init_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_W},
         .nbuffers = 1
@@ -283,7 +283,6 @@ static void redux_cuda_func(void *descr[], void *cl_arg)
 	h_a += h_b;
 
 	cudaMemcpyAsync(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -301,6 +300,7 @@ static struct starpu_codelet redux_codelet =
 	.cpu_funcs_name = {"redux_cpu_func", NULL},
 #ifdef STARPU_HAVE_CURAND
 	.cuda_funcs = {redux_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2

+ 4 - 3
examples/reductions/dot_product.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -75,7 +75,6 @@ void init_cuda_func(void *descr[], void *cl_arg)
 {
 	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -111,6 +110,7 @@ static struct starpu_codelet init_codelet =
 	.cpu_funcs_name = {"init_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {init_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {init_opencl_func, NULL},
@@ -194,6 +194,7 @@ static struct starpu_codelet redux_codelet =
 	.cpu_funcs_name = {"redux_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_func, NULL},
@@ -247,7 +248,6 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 	current_dot += local_dot;
 
 	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -313,6 +313,7 @@ static struct starpu_codelet dot_codelet =
 	.cpu_funcs_name = {"dot_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dot_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {dot_opencl_func, NULL},

+ 0 - 1
examples/reductions/dot_product_kernels.cu

@@ -31,5 +31,4 @@ extern "C" void redux_cuda_func(void *descr[], void *_args)
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 1
examples/spmv/spmv.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -98,6 +98,7 @@ static struct starpu_codelet spmv_cl =
 	.cpu_funcs = {spmv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {spmv_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
         .opencl_funcs = {spmv_kernel_opencl, NULL},

+ 1 - 4
examples/spmv/spmv_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -98,9 +98,6 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 }
 
 

+ 4 - 7
examples/stencil/stencil-kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -264,10 +264,6 @@ static void update_func_cuda(void *descr[], void *arg)
 #endif /* LIFE */
 	}
 
-	cudaError_t cures;
-	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
-		STARPU_CUDA_REPORT_ERROR(cures);
-
 	if (block->bz == 0)
 		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
@@ -465,6 +461,7 @@ struct starpu_codelet cl_update =
 	.cpu_funcs_name = {"update_func_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {update_func_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {update_func_opencl, NULL},
@@ -591,7 +588,6 @@ static void dummy_func_top_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg
 
 	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 /* bottom save, CUDA version */
@@ -605,7 +601,6 @@ static void dummy_func_bottom_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *
 
 	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif /* STARPU_USE_CUDA */
 
@@ -667,6 +662,7 @@ struct starpu_codelet save_cl_bottom =
 	.cpu_funcs_name = {"dummy_func_bottom_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {dummy_func_bottom_opencl, NULL},
@@ -682,6 +678,7 @@ struct starpu_codelet save_cl_top =
 	.cpu_funcs_name = {"dummy_func_top_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dummy_func_top_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {dummy_func_top_opencl, NULL},

+ 1 - 1
src/core/jobs.h

@@ -123,7 +123,7 @@ LIST_TYPE(_starpu_job,
 	 * so we need a flag to differentiate them from "normal" tasks. */
 	unsigned reduction_task;
 
-	/* Used by MIC driver to record codelet start time instead of using a
+	/* Used to record codelet start time instead of using a
 	 * local variable */
 	struct timespec cl_start;
 

+ 20 - 10
src/datawizard/data_request.c

@@ -415,13 +415,15 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 }
 
-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
+int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 	struct _starpu_data_request_list *empty_list;
 	int ret = 0;
 
+	*pushed = 0;
+
 	/* Here helgrind would should that this is an un protected access.
 	 * We however don't care about missing an entry, we will get called
 	 * again sooner or later. */
@@ -479,6 +481,8 @@ int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 			_starpu_data_request_list_push_back(new_data_requests, r);
 			break;
 		}
+
+		*pushed++;
 	}
 
 	while (!_starpu_data_request_list_empty(local_list))
@@ -500,13 +504,15 @@ int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 	return ret;
 }
 
-void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc)
+void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 	struct _starpu_data_request_list *new_prefetch_requests;
 	struct _starpu_data_request_list *empty_list;
 
+	*pushed = 0;
+
 	if (_starpu_data_request_list_empty(prefetch_requests[src_node]))
 		return;
 
@@ -563,6 +569,8 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 			}
 			break;
 		}
+
+		*pushed++;
 	}
 
 	while(!_starpu_data_request_list_empty(local_list))
@@ -590,7 +598,7 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 	_starpu_data_request_list_delete(local_list);
 }
 
-static void _handle_pending_node_data_requests(unsigned src_node, unsigned force)
+static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 {
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
 //
@@ -599,12 +607,12 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 	unsigned taken, kept;
 
 	if (_starpu_data_request_list_empty(data_requests_pending[src_node]))
-		return;
+		return 0;
 
 	empty_list = _starpu_data_request_list_new();
 	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[src_node]) && !force)
 		/* List is busy, do not bother with it */
-		return;
+		return 0;
 
 	/* for all entries of the list */
 	struct _starpu_data_request_list *local_list = data_requests_pending[src_node];
@@ -613,7 +621,7 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 		/* there is no request */
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
 		_starpu_data_request_list_delete(empty_list);
-		return;
+		return 0;
 	}
 	data_requests_pending[src_node] = empty_list;
 
@@ -680,16 +688,18 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 
 	_starpu_data_request_list_delete(local_list);
 	_starpu_data_request_list_delete(new_data_requests_pending);
+
+	return taken - kept;
 }
 
-void _starpu_handle_pending_node_data_requests(unsigned src_node)
+int _starpu_handle_pending_node_data_requests(unsigned src_node)
 {
-	_handle_pending_node_data_requests(src_node, 0);
+	return _handle_pending_node_data_requests(src_node, 0);
 }
 
-void _starpu_handle_all_pending_node_data_requests(unsigned src_node)
+int _starpu_handle_all_pending_node_data_requests(unsigned src_node)
 {
-	_handle_pending_node_data_requests(src_node, 1);
+	return _handle_pending_node_data_requests(src_node, 1);
 }
 
 int _starpu_check_that_no_data_request_exists(unsigned node)

+ 5 - 5
src/datawizard/data_request.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2010, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -110,11 +110,11 @@ void _starpu_init_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
 void _starpu_post_data_request(struct _starpu_data_request *r, unsigned handling_node);
 /* returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc);
-void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc);
+int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
+void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
 
-void _starpu_handle_pending_node_data_requests(unsigned src_node);
-void _starpu_handle_all_pending_node_data_requests(unsigned src_node);
+int _starpu_handle_pending_node_data_requests(unsigned src_node);
+int _starpu_handle_all_pending_node_data_requests(unsigned src_node);
 
 int _starpu_check_that_no_data_request_exists(unsigned node);
 

+ 26 - 7
src/datawizard/datawizard.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,8 +24,10 @@
 #include <msg/msg.h>
 #endif
 
-void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
+int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests)
 {
+	int ret = 0;
+
 #if STARPU_DEVEL
 #warning FIXME
 #endif
@@ -35,11 +37,28 @@ void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
 	STARPU_UYIELD();
 
 	/* in case some other driver requested data */
-	_starpu_handle_pending_node_data_requests(memory_node);
-	if (_starpu_handle_node_data_requests(memory_node, may_alloc) == 0)
-		/* We pushed all pending requests, we can afford pushing
-		 * prefetch requests */
-		_starpu_handle_node_prefetch_requests(memory_node, may_alloc);
+	if (_starpu_handle_pending_node_data_requests(memory_node))
+		ret = 1;
+	if (push_requests)
+	{
+		unsigned pushed;
+		if (_starpu_handle_node_data_requests(memory_node, may_alloc, &pushed) == 0)
+		{
+			if (pushed)
+				ret = 1;
+			/* We pushed all pending requests, we can afford pushing
+			 * prefetch requests */
+			_starpu_handle_node_prefetch_requests(memory_node, may_alloc, &pushed);
+		}
+		if (pushed)
+			ret = 1;
+	}
 	_starpu_execute_registered_progression_hooks();
+
+	return ret;
 }
 
+int _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
+{
+	__starpu_datawizard_progress(memory_node, may_alloc, 1);
+}

+ 3 - 2
src/datawizard/datawizard.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -33,6 +33,7 @@
 
 #include <core/dependencies/implicit_data_deps.h>
 
-void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc);
+int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
+int _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc);
 
 #endif // __DATAWIZARD_H__

+ 69 - 21
src/drivers/cuda/driver_cuda.c

@@ -46,6 +46,7 @@ static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
+static cudaEvent_t task_events[STARPU_MAXCUDADEVS];
 #endif /* STARPU_USE_CUDA */
 
 void
@@ -252,6 +253,10 @@ static void init_context(unsigned devid)
 
 	workerid = starpu_worker_get_id();
 
+	cures = cudaEventCreate(&task_events[workerid]);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
 	cures = cudaStreamCreate(&streams[workerid]);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
@@ -278,6 +283,7 @@ static void deinit_context(int workerid)
 	int devid = starpu_worker_get_devid(workerid);
 	int i;
 
+	cudaEventDestroy(task_events[workerid]);
 	cudaStreamDestroy(streams[workerid]);
 	cudaStreamDestroy(in_transfer_streams[devid]);
 	cudaStreamDestroy(out_transfer_streams[devid]);
@@ -327,21 +333,22 @@ void _starpu_init_cuda(void)
 	STARPU_ASSERT(ncudagpus <= STARPU_MAXCUDADEVS);
 }
 
-static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
+static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 {
 	int ret;
 
 	STARPU_ASSERT(j);
 	struct starpu_task *task = j->task;
 
-	struct timespec codelet_start, codelet_end;
-
 	int profiling = starpu_profiling_status_get();
 
 	STARPU_ASSERT(task);
 	struct starpu_codelet *cl = task->cl;
 	STARPU_ASSERT(cl);
 
+	_starpu_set_current_task(task);
+	args->current_task = j->task;
+
 	ret = _starpu_fetch_task_input(j);
 	if (ret != 0)
 	{
@@ -351,7 +358,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
 
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
@@ -367,18 +374,28 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 		_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
-		if (cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 	}
 
+	return 0;
+}
+
+static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
+{
+	struct timespec codelet_end;
+
+	int profiling = starpu_profiling_status_get();
+
+	_starpu_set_current_task(NULL);
+	args->current_task = NULL;
+
 	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, args, &args->perf_arch, &codelet_start, &codelet_end, profiling);
+	_starpu_driver_update_job_feedback(j, args, &args->perf_arch, &j->cl_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j);
 
-	return 0;
+	_starpu_handle_job_termination(j);
 }
 
 /* XXX Should this be merged with _starpu_init_cuda ? */
@@ -440,12 +457,30 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker *args)
 	unsigned memnode = args->memory_node;
 	int workerid = args->workerid;
 
-	_STARPU_TRACE_START_PROGRESS(memnode);
-	_starpu_datawizard_progress(memnode, 1);
-	_STARPU_TRACE_END_PROGRESS(memnode);
-
 	struct starpu_task *task;
-	struct _starpu_job *j = NULL;
+	struct _starpu_job *j;
+
+	task = starpu_task_get_current();
+
+	if (task)
+	{
+		/* On-going asynchronous task, check for its termination first */
+		cudaError_t cures = cudaEventQuery(task_events[workerid]);
+
+		if (cures != cudaSuccess)
+		{
+			/* Not ready yet, no better thing to do than waiting */
+			__starpu_datawizard_progress(memnode, 1, 0);
+
+			STARPU_ASSERT(cures == cudaErrorNotReady);
+			return 0;
+		}
+
+		/* Asynchronous task completed! */
+		finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
+	}
+
+	__starpu_datawizard_progress(memnode, 1, 1);
 
 	task = _starpu_get_worker_task(args, workerid, memnode);
 
@@ -462,13 +497,8 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker *args)
 		return 0;
 	}
 
-	_starpu_set_current_task(task);
-	args->current_task = j->task;
-
-	int res = execute_job_on_cuda(j, args);
-
-	_starpu_set_current_task(NULL);
-	args->current_task = NULL;
+	_STARPU_TRACE_END_PROGRESS(memnode);
+	int res = start_job_on_cuda(j, args);
 
 	if (res)
 	{
@@ -483,7 +513,23 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker *args)
 		}
 	}
 
-	_starpu_handle_job_termination(j);
+#ifndef STARPU_SIMGRID
+	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+	{
+		/* Record event to synchronize with task termination later */
+		cudaEventRecord(task_events[workerid], starpu_cuda_get_local_stream());
+	}
+	else
+#else
+#ifdef STARPU_DEVEL
+#warning No CUDA asynchronous execution with simgrid yet.
+#endif
+#endif
+	/* Synchronous execution */
+	{
+		finish_job_on_cuda(j, args);
+	}
+	_STARPU_TRACE_START_PROGRESS(memnode);
 
 	return 0;
 }
@@ -516,8 +562,10 @@ void *_starpu_cuda_worker(void *arg)
 	struct _starpu_worker* args = arg;
 
 	_starpu_cuda_driver_init(args);
+	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
 		_starpu_cuda_driver_run_once(args);
+	_STARPU_TRACE_END_PROGRESS(memnode);
 	_starpu_cuda_driver_deinit(args);
 
 	return NULL;

+ 73 - 30
src/drivers/opencl/driver_opencl.c

@@ -50,6 +50,7 @@ static cl_command_queue in_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue out_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue peer_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue alloc_queues[STARPU_MAXOPENCLDEVS];
+static cl_event task_events[STARPU_MAXOPENCLDEVS];
 #endif
 
 void
@@ -165,7 +166,7 @@ cl_int _starpu_opencl_init_context(int devid)
         err = clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
-        props &= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+        props &= ~CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
         in_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
         if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
         out_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
@@ -561,7 +562,8 @@ void _starpu_opencl_init(void)
 #ifndef STARPU_SIMGRID
 static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
 #endif
-static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_worker *args);
+static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *args);
+static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *args);
 
 int _starpu_opencl_driver_init(struct _starpu_worker *args)
 {
@@ -616,13 +618,36 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 	struct starpu_task *task;
 	int res;
 
-	_STARPU_TRACE_START_PROGRESS(memnode);
-	_starpu_datawizard_progress(memnode, 1);
+	task = starpu_task_get_current();
+
+	if (task)
+	{
+		cl_int status;
+		size_t size;
+		int err;
+		/* On-going asynchronous task, check for its termination first */
+
+		err = clGetEventInfo(task_events[args->devid], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
+		STARPU_ASSERT(size == sizeof(cl_int));
+		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+
+		if (status != CL_COMPLETE)
+		{
+			/* Not ready yet, no better thing to do than waiting */
+			__starpu_datawizard_progress(memnode, 1, 0);
+			return 0;
+		}
+
+		/* Asynchronous task completed! */
+		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
+	}
+
+	__starpu_datawizard_progress(memnode, 1, 1);
+
 	_STARPU_TRACE_END_PROGRESS(memnode);
 
 	task = _starpu_get_worker_task(args, workerid, memnode);
 
-
 	if (task == NULL)
 		return 0;
 
@@ -636,13 +661,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		return 0;
 	}
 
-	_starpu_set_current_task(j->task);
-	args->current_task = j->task;
-
-	res = _starpu_opencl_execute_job(j, args);
-
-	_starpu_set_current_task(NULL);
-	args->current_task = NULL;
+	res = _starpu_opencl_start_job(j, args);
 
 	if (res)
 	{
@@ -658,7 +677,28 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		}
 	}
 
-	_starpu_handle_job_termination(j);
+#ifndef STARPU_SIMGRID
+	if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+	{
+		/* Record event to synchronize with task termination later */
+		int err;
+		cl_command_queue queue;
+		starpu_opencl_get_queue(args->devid, &queue);
+		err = clEnqueueMarker(queue, &task_events[args->devid]);
+		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+	else
+#else
+#ifdef STARPU_DEVEL
+#warning No CUDA asynchronous execution with simgrid yet.
+#endif
+#endif
+	/* Synchronous execution */
+	{
+		_starpu_opencl_stop_job(j, args);
+	}
+	_STARPU_TRACE_START_PROGRESS(memnode);
+
 	return 0;
 }
 
@@ -692,9 +732,11 @@ void *_starpu_opencl_worker(void *arg)
 	struct _starpu_worker* args = arg;
 
 	_starpu_opencl_driver_init(args);
+	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
 		_starpu_opencl_driver_run_once(args);
 	_starpu_opencl_driver_deinit(args);
+	_STARPU_TRACE_END_PROGRESS(memnode);
 
 	return NULL;
 }
@@ -746,7 +788,7 @@ cl_device_type _starpu_opencl_get_device_type(int devid)
 }
 #endif /* STARPU_USE_OPENCL */
 
-static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
+static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *args)
 {
 	int ret;
 
@@ -754,12 +796,14 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 	struct starpu_task *task = j->task;
 
 	int profiling = starpu_profiling_status_get();
-	struct timespec codelet_start, codelet_end;
 
 	STARPU_ASSERT(task);
 	struct starpu_codelet *cl = task->cl;
 	STARPU_ASSERT(cl);
 
+	_starpu_set_current_task(j->task);
+	args->current_task = j->task;
+
 	ret = _starpu_fetch_task_input(j);
 	if (ret != 0)
 	{
@@ -769,7 +813,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");
@@ -780,12 +824,6 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		double length = NAN;
 	  #ifdef STARPU_OPENCL_SIMULATOR
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
-		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
-		{
-			cl_command_queue queue;
-			starpu_opencl_get_queue(args->devid, &queue);
-			clFinish(queue);
-		}
 	    #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
 	      #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
 		#define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
@@ -800,23 +838,28 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		_starpu_simgrid_execute_job(j, &args->perf_arch, length);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
-		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
-		{
-			cl_command_queue queue;
-			starpu_opencl_get_queue(args->devid, &queue);
-			clFinish(queue);
-		}
 #endif
 	}
+	return 0;
+}
+
+static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *args)
+{
+	struct timespec codelet_end;
+	int profiling = starpu_profiling_status_get();
+
+	_starpu_set_current_task(NULL);
+	args->current_task = NULL;
 
 	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
 
 	_starpu_driver_update_job_feedback(j, args, &args->perf_arch,
-					   &codelet_start, &codelet_end, profiling);
+					   &j->cl_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j);
 
-	return EXIT_SUCCESS;
+	_starpu_handle_job_termination(j);
+
 }
 
 #ifdef STARPU_USE_OPENCL

+ 2 - 1
tests/datawizard/acquire_release.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,6 +44,7 @@ static struct starpu_codelet increment_cl =
 	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl, NULL},

+ 1 - 0
tests/datawizard/acquire_release2.c

@@ -44,6 +44,7 @@ static struct starpu_codelet increment_cl =
 	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl, NULL},

+ 0 - 2
tests/datawizard/acquire_release_cuda.cu

@@ -26,6 +26,4 @@ extern "C" void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_a
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	_increment_cuda_codelet<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 3
tests/datawizard/cuda_codelet_unsigned_inc.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -28,6 +28,4 @@ extern "C" void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	_cuda_unsigned_inc<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 2
tests/datawizard/data_invalidation.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -45,7 +45,6 @@ static void cuda_memset_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_ar
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(buf, 42, length, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -93,6 +92,7 @@ static struct starpu_codelet memset_cl =
 	.cpu_funcs = {cpu_memset_codelet, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_memset_codelet, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_memset_codelet, NULL},

+ 1 - 1
tests/datawizard/handle_to_pointer.c

@@ -50,7 +50,6 @@ static void cuda_task(void **buffers, void *args)
 	{
 		cudaMemcpyAsync(&numbers[i], &i, sizeof(int), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	}
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -87,6 +86,7 @@ static struct starpu_codelet cl =
 	.cpu_funcs = {cpu_task, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_task, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_task, NULL},

+ 4 - 4
tests/datawizard/increment_redux.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,7 +44,6 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	host_dst += host_src;
 
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -56,7 +55,6 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -119,6 +117,7 @@ static struct starpu_codelet redux_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_kernel, NULL},
@@ -133,6 +132,7 @@ static struct starpu_codelet neutral_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {neutral_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {neutral_opencl_kernel, NULL},
@@ -182,7 +182,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -198,6 +197,7 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},

+ 4 - 4
tests/datawizard/increment_redux_lazy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -40,7 +40,6 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	host_dst += host_src;
 
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -50,7 +49,6 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -105,6 +103,7 @@ static struct starpu_codelet redux_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_kernel, NULL},
@@ -119,6 +118,7 @@ static struct starpu_codelet neutral_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {neutral_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {neutral_opencl_kernel, NULL},
@@ -164,7 +164,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -178,6 +177,7 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},

+ 5 - 4
tests/datawizard/increment_redux_v2.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2013  Université de Bordeaux 1
+ * Copyright (C) 2011-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -43,7 +43,6 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	host_dst += host_src;
 
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -55,7 +54,6 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -118,6 +116,7 @@ static struct starpu_codelet redux_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_kernel, NULL},
@@ -132,6 +131,7 @@ static struct starpu_codelet neutral_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {neutral_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {neutral_opencl_kernel, NULL},
@@ -181,7 +181,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -197,6 +196,7 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},
@@ -211,6 +211,7 @@ struct starpu_codelet increment_cl_redux =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},

+ 1 - 0
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets.c

@@ -38,6 +38,7 @@ void cuda_to_cpu(void *buffers[], void *arg)
 extern void cpu_to_cuda_cuda_func(void *buffers[], void *args); struct starpu_codelet cpu_to_cuda_cl =
 {
 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1
 };
 

+ 0 - 2
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu

@@ -45,6 +45,4 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 2
tests/datawizard/lazy_allocation.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012       inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -40,7 +40,6 @@ static void cuda_memset_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_ar
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(buf, 42, length, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -88,6 +87,7 @@ static struct starpu_codelet memset_cl =
 	.cpu_funcs = {cpu_memset_codelet, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_memset_codelet, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_memset_codelet, NULL},

+ 2 - 2
tests/datawizard/manual_reduction.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -117,7 +117,6 @@ static void cuda_func_incr(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	h_val++;
 	cudaMemcpyAsync(val, &h_val, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -146,6 +145,7 @@ static struct starpu_codelet use_data_on_worker_codelet =
 	.cpu_funcs = {cpu_func_incr, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_func_incr, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_func_incr, NULL},

+ 2 - 1
tests/datawizard/mpi_like.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -60,6 +60,7 @@ static struct starpu_codelet increment_handle_cl =
 	.cpu_funcs = {increment_handle_cpu_kernel, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_unsigned_inc, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet_unsigned_inc, NULL},

+ 2 - 1
tests/datawizard/mpi_like_async.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -88,6 +88,7 @@ static struct starpu_codelet increment_handle_cl =
 	.cpu_funcs = {increment_handle_cpu_kernel, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_unsigned_inc, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = { opencl_codelet_unsigned_inc, NULL},

+ 2 - 1
tests/datawizard/partition_lazy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,6 +26,7 @@ struct starpu_codelet mycodelet =
 #endif
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = { scal_func_cuda, NULL },
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.cpu_funcs_name = {"scal_func_cpu", NULL},
 	.modes = { STARPU_W },

+ 2 - 1
tests/datawizard/scal.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -95,6 +95,7 @@ struct starpu_codelet scal_codelet =
 #endif
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = { scal_func_cuda, NULL },
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.cpu_funcs_name = {"scal_func_cpu", NULL},
 	.modes = { STARPU_RW },

+ 1 - 3
tests/datawizard/scal_cuda.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,6 +33,4 @@ extern "C" void scal_func_cuda(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 1
tests/datawizard/scratch.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -67,6 +67,7 @@ static struct starpu_codelet cl_f =
 	.cpu_funcs = {cpu_f, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_f, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_f, NULL},

+ 1 - 2
tests/datawizard/scratch_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -47,5 +47,4 @@ extern "C" void cuda_f(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
 	
 	increment_vector<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(v, tmp, nx);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 0
tests/datawizard/specific_node.c

@@ -60,6 +60,7 @@ static struct starpu_codelet cl =
 	.cpu_funcs = {specific_kernel, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_unsigned_inc, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet_unsigned_inc, NULL},

+ 3 - 1
tests/datawizard/sync_and_notify_data.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  *
@@ -105,6 +105,7 @@ int main(int argc, char **argv)
 				.cpu_funcs = {cpu_codelet_incA, NULL},
 #ifdef STARPU_USE_CUDA
 				.cuda_funcs = {cuda_codelet_incA, NULL},
+				.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 				.opencl_funcs = {opencl_codelet_incA, NULL},
@@ -142,6 +143,7 @@ int main(int argc, char **argv)
 				.cpu_funcs = {cpu_codelet_incC, NULL},
 #ifdef STARPU_USE_CUDA
 				.cuda_funcs = {cuda_codelet_incC, NULL},
+				.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 				.opencl_funcs = {opencl_codelet_incC, NULL},

+ 3 - 1
tests/datawizard/sync_and_notify_data_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -78,6 +78,7 @@ static struct starpu_codelet cl_inc_a =
 	.cpu_funcs = {cpu_codelet_incA, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_incA, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet_incA, NULL},
@@ -93,6 +94,7 @@ struct starpu_codelet cl_inc_c =
 	.cpu_funcs = {cpu_codelet_incC, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_incC, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet_incC, NULL},

+ 1 - 5
tests/datawizard/sync_and_notify_data_kernels.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,8 +31,6 @@ extern "C" void cuda_codelet_incA(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	_cuda_incA<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 /*
@@ -49,6 +47,4 @@ extern "C" void cuda_codelet_incC(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	_cuda_incC<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 2
tests/datawizard/write_only_tmp_buffer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -52,7 +52,6 @@ static void cuda_codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cudaMemsetAsync(buf, 42, 1, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -80,6 +79,7 @@ static struct starpu_codelet cl =
 	.cpu_funcs = {cpu_codelet_null, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_codelet_null, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet_null, NULL},

+ 2 - 2
tests/datawizard/wt_broadcast.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,7 +55,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -69,6 +68,7 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},

+ 2 - 2
tests/datawizard/wt_host.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -59,7 +59,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -75,6 +74,7 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},

+ 1 - 5
tests/main/increment.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013  Université de Bordeaux 1
+ * Copyright (C) 2013-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,10 +25,6 @@ extern "C" void cuda_host_increment(void *descr[], void *_args)
 {
 	(void)_args;
 	unsigned *var = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	cudaError_t cures;
 
 	cuda_increment<<<1,1, 0, starpu_cuda_get_local_stream()>>>(var);
-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	if (cures != cudaSuccess)
-		STARPU_CUDA_REPORT_ERROR(cures);
 }

+ 2 - 1
tests/main/subgraph_repeat.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -62,6 +62,7 @@ static struct starpu_codelet dummy_codelet =
 	.cpu_funcs = {cpu_increment, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_host_increment, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	// TODO
 	//.opencl_funcs = {dummy_func, NULL},

+ 2 - 1
tests/main/subgraph_repeat_regenerate.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -62,6 +62,7 @@ static struct starpu_codelet dummy_codelet =
 	.cpu_funcs = {cpu_increment, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_host_increment, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	// TODO
 	//.opencl_funcs = {dummy_func, NULL},

+ 2 - 1
tests/main/subgraph_repeat_regenerate_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -68,6 +68,7 @@ static struct starpu_codelet dummy_codelet =
 	.cpu_funcs = {cpu_increment, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_host_increment, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	// TODO
 	//.opencl_funcs = {dummy_func, NULL},

+ 2 - 1
tests/main/subgraph_repeat_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -61,6 +61,7 @@ static struct starpu_codelet dummy_codelet =
 	.cpu_funcs = {cpu_increment, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_host_increment, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	// TODO
 	//.opencl_funcs = {dummy_func, NULL},

+ 2 - 2
tests/microbenchs/matrix_as_vector.c

@@ -50,7 +50,6 @@ void vector_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	sum /= nx;
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif /* STARPU_USE_CUDA */
 }
 
@@ -82,7 +81,6 @@ void matrix_cuda_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 	sum /= nx*ny;
 
 	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif /* STARPU_USE_CUDA */
 }
 
@@ -180,6 +178,7 @@ int check_size_on_device(uint32_t where, char *device_name)
 	vector_codelet.nbuffers = 1;
 	if (where == STARPU_CPU) vector_codelet.cpu_funcs[0] = vector_cpu_func;
 	if (where == STARPU_CUDA) vector_codelet.cuda_funcs[0] = vector_cuda_func;
+	if (where == STARPU_CUDA) vector_codelet.cuda_flags[0] = STARPU_CUDA_ASYNC;
 //	if (where == STARPU_OPENCL) vector_codelet.opencl_funcs[0] = vector_opencl_func;
 
 	starpu_codelet_init(&matrix_codelet);
@@ -187,6 +186,7 @@ int check_size_on_device(uint32_t where, char *device_name)
 	matrix_codelet.nbuffers = 1;
 	if (where == STARPU_CPU) matrix_codelet.cpu_funcs[0] = matrix_cpu_func;
 	if (where == STARPU_CUDA) matrix_codelet.cuda_funcs[0] = matrix_cuda_func;
+	if (where == STARPU_CUDA) vector_codelet.cuda_flags[0] = STARPU_CUDA_ASYNC;
 //	if (where == STARPU_OPENCL) matrix_codelet.opencl_funcs[0] = matrix_opencl_func;
 
 	for(nx=NX_MIN ; nx<=NX_MAX ; nx*=2)

+ 2 - 2
tests/perfmodels/non_linear_regression_based.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  *
@@ -29,7 +29,6 @@ static void memset_cuda(void *descr[], void *arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -57,6 +56,7 @@ static struct starpu_codelet memset_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {memset_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {memset_opencl, NULL},

+ 2 - 2
tests/perfmodels/regression_based.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2013  Université de Bordeaux 1
+ * Copyright (C) 2011-2014  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2012 inria
  *
@@ -30,7 +30,6 @@ static void memset_cuda(void *descr[], void *arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -64,6 +63,7 @@ static struct starpu_codelet memset_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {memset_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {memset_opencl, NULL},