Nathalie Furmento лет назад: 11
Родитель
Сommit
d6b0c372cd
100 измененных файлов с 866 добавлено и 527 удалено
  1. 4 0
      ChangeLog
  2. 2 2
      doc/doxygen/chapters/10scheduling_context_hypervisor.doxy
  3. 1 1
      doc/doxygen/chapters/21simgrid.doxy
  4. 3 3
      doc/doxygen/chapters/api/data_management.doxy
  5. 3 3
      doc/doxygen/chapters/api/threads.doxy
  6. 3 3
      doc/doxygen/doxygen.cfg
  7. 2 2
      examples/audio/starpu_audio_processing.c
  8. 3 2
      examples/axpy/axpy.c
  9. 1 6
      examples/axpy/axpy_opencl.c
  10. 2 0
      examples/basic_examples/multiformat_conversion_codelets.c
  11. 0 2
      examples/basic_examples/multiformat_conversion_codelets_cuda.cu
  12. 1 6
      examples/basic_examples/multiformat_conversion_codelets_opencl.c
  13. 1 0
      examples/basic_examples/vector_scal.c
  14. 2 7
      examples/basic_examples/vector_scal_opencl.c
  15. 14 0
      examples/callback/prologue.c
  16. 9 11
      examples/cg/cg_kernels.c
  17. 0 2
      examples/filters/custom_mf/conversion.cu
  18. 1 6
      examples/filters/custom_mf/conversion_opencl.c
  19. 0 2
      examples/filters/custom_mf/cuda.cu
  20. 2 0
      examples/filters/custom_mf/custom_conversion_codelets.c
  21. 2 0
      examples/filters/custom_mf/custom_mf_filter.c
  22. 1 6
      examples/filters/custom_mf/custom_opencl.c
  23. 3 1
      examples/filters/fblock.c
  24. 0 2
      examples/filters/fblock_cuda.cu
  25. 2 7
      examples/filters/fblock_opencl.c
  26. 2 2
      examples/filters/shadow.c
  27. 2 2
      examples/filters/shadow2d.c
  28. 2 3
      examples/filters/shadow3d.c
  29. 4 1
      examples/heat/dw_factolu.c
  30. 1 7
      examples/heat/dw_factolu_kernels.c
  31. 2 0
      examples/interface/complex.c
  32. 0 2
      examples/interface/complex_kernels.cu
  33. 1 6
      examples/interface/complex_kernels_opencl.c
  34. 5 11
      examples/lu/xlu_kernels.c
  35. 4 5
      examples/mandelbrot/mandelbrot.c
  36. 3 7
      examples/matvecmult/matvecmult.c
  37. 2 2
      examples/mult/xgemm.c
  38. 3 3
      examples/pi/pi_redux.c
  39. 8 15
      examples/reductions/dot_product.c
  40. 0 1
      examples/reductions/dot_product_kernels.cu
  41. 3 1
      examples/spmv/spmv.c
  42. 1 4
      examples/spmv/spmv_cuda.cu
  43. 2 7
      examples/spmv/spmv_kernels.c
  44. 2 5
      examples/stencil/life_opencl.c
  45. 2 5
      examples/stencil/shadow_opencl.c
  46. 9 31
      examples/stencil/stencil-kernels.c
  47. 2 1
      include/starpu_perfmodel.h
  48. 5 0
      include/starpu_task.h
  49. 17 15
      include/starpu_task_util.h
  50. 1 1
      libstarpu-mic.pc.in
  51. 1 1
      libstarpu.pc.in
  52. 4 4
      mpi/src/starpu_mpi.c
  53. 50 0
      mpi/src/starpu_mpi_task_insert.c
  54. 1 1
      sc_hypervisor/src/policies_utils/lp_programs.c
  55. 3 3
      src/common/fxt.h
  56. 1 1
      src/core/jobs.h
  57. 1 1
      src/core/perfmodel/perfmodel_history.c
  58. 94 34
      src/core/sched_ctx.c
  59. 0 4
      src/core/sched_ctx.h
  60. 3 0
      src/core/sched_policy.c
  61. 4 0
      src/core/task.c
  62. 24 16
      src/core/topology.c
  63. 40 33
      src/core/workers.c
  64. 4 0
      src/core/workers.h
  65. 7 1
      src/datawizard/coherency.c
  66. 7 7
      src/datawizard/copy_driver.c
  67. 24 14
      src/datawizard/data_request.c
  68. 5 5
      src/datawizard/data_request.h
  69. 26 7
      src/datawizard/datawizard.c
  70. 2 1
      src/datawizard/datawizard.h
  71. 1 1
      src/datawizard/filters.c
  72. 1 1
      src/datawizard/footprint.c
  73. 1 1
      src/datawizard/interfaces/data_interface.c
  74. 39 15
      src/datawizard/memalloc.c
  75. 8 6
      src/datawizard/memalloc.h
  76. 3 3
      src/datawizard/memory_nodes.c
  77. 9 2
      src/debug/traces/starpu_fxt.c
  78. 1 1
      src/drivers/cpu/driver_cpu.c
  79. 185 77
      src/drivers/cuda/driver_cuda.c
  80. 7 7
      src/drivers/cuda/driver_cuda.h
  81. 1 0
      src/drivers/driver_common/driver_common.c
  82. 10 1
      src/drivers/mic/driver_mic_source.c
  83. 79 31
      src/drivers/opencl/driver_opencl.c
  84. 1 1
      src/drivers/scc/driver_scc_source.c
  85. 36 1
      src/util/starpu_task_insert_utils.c
  86. 1 1
      starpu-1.0-mic.pc.in
  87. 1 1
      starpu-1.0.pc.in
  88. 1 1
      starpu-1.1.pc.in
  89. 1 1
      starpu-1.2.pc.in
  90. 8 0
      tests/Makefile.am
  91. 3 1
      tests/datawizard/acquire_release.c
  92. 2 0
      tests/datawizard/acquire_release2.c
  93. 0 2
      tests/datawizard/acquire_release_cuda.cu
  94. 1 6
      tests/datawizard/acquire_release_opencl.c
  95. 1 3
      tests/datawizard/cuda_codelet_unsigned_inc.cu
  96. 3 3
      tests/datawizard/data_invalidation.c
  97. 2 2
      tests/datawizard/handle_to_pointer.c
  98. 7 7
      tests/datawizard/increment_redux.c
  99. 7 7
      tests/datawizard/increment_redux_lazy.c
  100. 0 0
      tests/datawizard/increment_redux_v2.c

+ 4 - 0
ChangeLog

@@ -51,6 +51,8 @@ New features:
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
     CUDA and OpenCL kernel execution.
   * Add paje traces statistics tools.
+  * Add CUDA concurrent kernel execution support through
+    the STARPU_NWORKER_PER_CUDA environment variable.
 
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and
@@ -91,6 +93,8 @@ Changes:
   * StarPU-MPI: Fix for being able to receive data with the same tag
     from several nodes (see mpi/tests/gather.c)
   * StarPU-MPI: Fix overzealous allocation of memory.
+  * Interfaces: Allow interface implementation to change pointers at will, in
+    unpack notably.
 
 Small changes:
   * Rename function starpu_trace_user_event() as

+ 2 - 2
doc/doxygen/chapters/10scheduling_context_hypervisor.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -192,7 +192,7 @@ The <b>Throughput </b> strategy focuses on maximizing the throughput of the reso
 and resizes the contexts such that the machine is running at its maximum efficiency
 (maximum instant speed of the workers).
 
-\section  Defining a new hypervisor policy
+\section DefiningANewHypervisorPolicy Defining A New Hypervisor Policy
 
 While Scheduling Context Hypervisor Plugin comes with a variety of
 resizing policies (see \ref ResizingStrategies), it may sometimes be

+ 1 - 1
doc/doxygen/chapters/21simgrid.doxy

@@ -22,7 +22,7 @@ get the simulated time, it has to use starpu_timing_now() which returns the
 virtual timestamp in us.
 
 For some technical reason, the application's .c file which contains main() has
-to be recompiled with starpu.h, which in the simgrid case will #define main()
+to be recompiled with starpu.h, which in the simgrid case will # define main()
 into starpu_main(), and it is libstarpu which will provide the real main() and
 call the application's main().
 

+ 3 - 3
doc/doxygen/chapters/api/data_management.doxy

@@ -163,9 +163,9 @@ will commit their changes in main memory (node 0).
 Issue a prefetch request for a given data to a given node, i.e.
 requests that the data be replicated to the given node, so that it is
 available there for tasks. If the \p async parameter is 0, the call will
-block until the transfer is achieved, else the call will return as
-soon as the request is scheduled (which may however have to wait for a
-task completion).
+block until the transfer is achieved, else the call will return immediately,
+after having just queued the request. In the latter case, the request will
+asynchronously wait for the completion of any task writing on the data.
 
 \fn starpu_data_handle_t starpu_data_lookup(const void *ptr)
 \ingroup API_Data_Management

+ 3 - 3
doc/doxygen/chapters/api/threads.doxy

@@ -152,7 +152,7 @@ This function initializes the thread attributes object pointed to by
 \p attr with default attribute values.
 
 It does not do anything when the simulated performance mode is enabled
-(\ref SimulatedPerformance).
+(\ref SimGridSupport).
 
 \fn int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr)
 \ingroup API_Threads
@@ -161,7 +161,7 @@ required. Destroying a thread attributes object has no effect on
 threads that were created using that object.
 
 It does not do anything when the simulated performance mode is enabled
-(\ref SimulatedPerformance).
+(\ref SimGridSupport).
 
 \fn int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate)
 \ingroup API_Threads
@@ -172,7 +172,7 @@ created using the thread attributes object \p attr will be created in
 a joinable or a detached state.
 
 It does not do anything when the simulated performance mode is enabled
-(\ref SimulatedPerformance).
+(\ref SimGridSupport).
 
 \fn int starpu_pthread_mutex_init(starpu_pthread_mutex_t *mutex, const starpu_pthread_mutexattr_t *mutexattr)
 \ingroup API_Threads

+ 3 - 3
doc/doxygen/doxygen.cfg

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2013  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
@@ -645,13 +645,13 @@ QUIET                  = YES
 # generated by doxygen. Possible values are YES and NO. If left blank
 # NO is used.
 
-WARNINGS               = YES
+WARNINGS               = NO
 
 # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
 # for undocumented members. If EXTRACT_ALL is set to YES then this flag will
 # automatically be disabled.
 
-WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_UNDOCUMENTED   = NO
 
 # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
 # potential errors in the documentation, such as not documenting some

+ 2 - 2
examples/audio/starpu_audio_processing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -211,7 +211,6 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
 	/* FFTW does not normalize its output ! */
 	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -284,6 +283,7 @@ static struct starpu_codelet band_filter_cl =
 	.modes = { STARPU_RW },
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {band_filter_kernel_gpu, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.cpu_funcs = {band_filter_kernel_cpu, NULL},
 	.model = &band_filter_model,

+ 3 - 2
examples/axpy/axpy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -71,7 +71,6 @@ void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -84,9 +83,11 @@ static struct starpu_codelet axpy_cl =
 	.cpu_funcs = {axpy_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {axpy_gpu, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {axpy_opencl, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},

+ 1 - 6
examples/axpy/axpy_opencl.c

@@ -60,14 +60,9 @@ void axpy_opencl(void *buffers[], void *_args)
                 if (local > global)
 			local=global;
 
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
 	starpu_opencl_release_kernel(kernel);
 }

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets.c

@@ -37,6 +37,7 @@ extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
 struct starpu_codelet cpu_to_cuda_cl =
 {
 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1,
 	.name = "codelet_cpu_to_cuda"
 };
@@ -68,6 +69,7 @@ extern void cpu_to_opencl_opencl_func(void *buffers[], void *args);
 struct starpu_codelet cpu_to_opencl_cl =
 {
 	.opencl_funcs = {cpu_to_opencl_opencl_func, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 	.nbuffers = 1
 };
 

+ 0 - 2
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -44,6 +44,4 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 6
examples/basic_examples/multiformat_conversion_codelets_opencl.c

@@ -84,15 +84,10 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 					&local,
 					0,
 					NULL,
-					&event);
+					NULL);
 
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
 	starpu_opencl_release_kernel(kernel);
 }

+ 1 - 0
examples/basic_examples/vector_scal.c

@@ -88,6 +88,7 @@ static struct starpu_codelet cl =
 #ifdef STARPU_USE_OPENCL
 	/* OpenCL implementation of the codelet */
 	.opencl_funcs = {scal_opencl_func, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_RW},

+ 2 - 7
examples/basic_examples/vector_scal_opencl.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Institut National de Recherche en Informatique et Automatique
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -61,13 +61,8 @@ void scal_opencl_func(void *buffers[], void *_args)
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global) local=global;
 
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
 	starpu_opencl_release_kernel(kernel);
 }

+ 14 - 0
examples/callback/prologue.c

@@ -17,6 +17,7 @@
 
 #include <starpu.h>
 #include <sys/time.h>
+#include <omp.h>
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
@@ -27,6 +28,7 @@ void cpu_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 	int *val = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	*val += 1;
+	printf("task executing \n");
 }
 
 struct starpu_codelet cl =
@@ -55,6 +57,12 @@ void prologue_callback_func(void *callback_arg)
 	printf("x = %lf\n", *x);
 }
 
+void pop_prologue_callback_func(void *args)
+{
+	unsigned val = (unsigned) args;
+	printf("pop_prologue_callback val %d \n", val);
+}
+
 
 int main(int argc, char **argv)
 {
@@ -72,6 +80,10 @@ int main(int argc, char **argv)
 	task->cl = &cl;
 	task->prologue_callback_func = callback_func;
 	task->prologue_callback_arg = NULL;
+
+	task->prologue_callback_pop_func = pop_prologue_callback_func;
+	task->prologue_callback_pop_arg = (void*) 5;
+
 	task->handles[0] = handle;
 
 	ret = starpu_task_submit(task);
@@ -84,6 +96,8 @@ int main(int argc, char **argv)
 				      STARPU_RW, handle,
 				      STARPU_PROLOGUE_CALLBACK, prologue_callback_func,
 				      STARPU_PROLOGUE_CALLBACK_ARG, x,
+				      STARPU_PROLOGUE_CALLBACK_POP, pop_prologue_callback_func,
+				      STARPU_PROLOGUE_CALLBACK_POP_ARG, 5,
 				      0);
 
 

+ 9 - 11
examples/cg/cg_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -73,7 +73,6 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  
 	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -97,6 +96,7 @@ struct starpu_codelet accumulate_variable_cl =
 	.cpu_funcs = {accumulate_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {accumulate_variable_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2,
@@ -111,7 +111,6 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
 	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -136,6 +135,7 @@ struct starpu_codelet accumulate_vector_cl =
 	.cpu_funcs = {accumulate_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {accumulate_vector_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2,
@@ -154,8 +154,6 @@ static void bzero_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	zero_vector(v, 1);
- 
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -177,6 +175,7 @@ struct starpu_codelet bzero_variable_cl =
 	.cpu_funcs = {bzero_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {bzero_variable_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_W},
 	.nbuffers = 1,
@@ -190,8 +189,6 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
 	zero_vector(v, n);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -215,6 +212,7 @@ struct starpu_codelet bzero_vector_cl =
 	.cpu_funcs = {bzero_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {bzero_vector_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_W},
 	.nbuffers = 1,
@@ -322,7 +320,6 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
 	cublasscal(n, alpha, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -350,6 +347,7 @@ static struct starpu_codelet scal_kernel_cl =
 	.cpu_funcs = {scal_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {scal_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 1,
 	.model = &scal_kernel_model
@@ -375,7 +373,6 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 
 	/* Compute v1 = alpha M v2 + beta v1 */
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -425,6 +422,7 @@ static struct starpu_codelet gemv_kernel_cl =
 	.cpu_funcs = {gemv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {gemv_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 3,
 	.model = &gemv_kernel_model
@@ -488,7 +486,6 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	 */
 	cublasscal(n, p1, v1, 1);
 	cublasaxpy(n, p2, v2, 1, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -522,6 +519,7 @@ static struct starpu_codelet scal_axpy_kernel_cl =
 	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {scal_axpy_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.model = &scal_axpy_kernel_model
@@ -565,7 +563,6 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 	/* Compute v1 = v1 + p1 * v2.
 	 */
 	cublasaxpy(n, p1, v2, 1, v1, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -596,6 +593,7 @@ static struct starpu_codelet axpy_kernel_cl =
 	.cpu_funcs = {axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {axpy_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.model = &axpy_kernel_model

+ 0 - 2
examples/filters/custom_mf/conversion.cu

@@ -45,6 +45,4 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 6
examples/filters/custom_mf/conversion_opencl.c

@@ -87,15 +87,10 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 				&local,  /* local_work_size */
 				0,       /* num_events_in_wait_list */
 				NULL,    /* event_wait_list */
-				&event);
+				NULL);
 
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
 	starpu_opencl_release_kernel(kernel);
 }

+ 0 - 2
examples/filters/custom_mf/cuda.cu

@@ -39,6 +39,4 @@ extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/filters/custom_mf/custom_conversion_codelets.c

@@ -40,6 +40,7 @@ extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
 struct starpu_codelet cpu_to_cuda_cl =
 {
 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.modes = { STARPU_RW },
 	.nbuffers = 1,
 	.name = "codelet_cpu_to_cuda"
@@ -76,6 +77,7 @@ extern void cpu_to_opencl_opencl_func(void *buffers[], void *arg);
 struct starpu_codelet cpu_to_opencl_cl =
 {
 	.opencl_funcs = { cpu_to_opencl_opencl_func, NULL },
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 	.modes = { STARPU_RW },
 	.nbuffers = 1,
 	.name = "codelet_cpu_to_opencl"

+ 2 - 0
examples/filters/custom_mf/custom_mf_filter.c

@@ -158,6 +158,7 @@ static struct starpu_codelet cpu_cl =
 static struct starpu_codelet cuda_cl =
 {
 	.cuda_funcs = { custom_scal_cuda_func, NULL },
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1,
 	.modes = { STARPU_RW },
 	.name = "cuda_codelet"
@@ -170,6 +171,7 @@ extern void custom_scal_opencl_func(void *buffers[], void *args);
 static struct starpu_codelet opencl_cl =
 {
 	.opencl_funcs = { custom_scal_opencl_func, NULL },
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 	.nbuffers = 1,
 	.modes = { STARPU_RW },
 	.name = "opencl_codelet"

+ 1 - 6
examples/filters/custom_mf/custom_opencl.c

@@ -86,15 +86,10 @@ void custom_scal_opencl_func(void *buffers[], void *args)
 				&local,  /* local_work_size */
 				0,       /* num_events_in_wait_list */
 				NULL,    /* event_wait_list */
-				&event);
+				NULL);
 
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
 	starpu_opencl_release_kernel(kernel);
 }

+ 3 - 1
examples/filters/fblock.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -95,9 +95,11 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
                 .opencl_funcs = {opencl_func, NULL},
+		.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 		.nbuffers = 1,
                 .modes = {STARPU_RW},

+ 0 - 2
examples/filters/fblock_cuda.cu

@@ -41,6 +41,4 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
         /* TODO: use more blocks and threads in blocks */
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 7
examples/filters/fblock_opencl.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -61,14 +61,9 @@ void opencl_func(void *buffers[], void *cl_arg)
 
 	{
 		size_t global=nx*ny*nz;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
 	starpu_opencl_release_kernel(kernel);
 }
 

+ 2 - 2
examples/filters/shadow.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -85,7 +85,6 @@ void cuda_func(void *buffers[], void *cl_arg)
 	/* If things go right, sizes should match */
 	STARPU_ASSERT(n == n2);
 	cudaMemcpyAsync(val2, val, n*sizeof(*val), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -103,6 +102,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
                 .nbuffers = 2,
 		.modes = {STARPU_R, STARPU_W}

+ 2 - 2
examples/filters/shadow2d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -139,7 +139,6 @@ void cuda_func(void *buffers[], void *cl_arg)
 	STARPU_ASSERT(n == n2);
 	STARPU_ASSERT(m == m2);
 	cudaMemcpy2DAsync(val2, ld2*sizeof(*val2), val, ld*sizeof(*val), n*sizeof(*val), m, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -157,6 +156,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
                 .nbuffers = 2,
 		.modes = {STARPU_R, STARPU_W}

+ 2 - 3
examples/filters/shadow3d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -105,8 +105,6 @@ void cuda_func(void *buffers[], void *cl_arg)
 				x*sizeof(*val), y, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 		STARPU_ASSERT(!cures);
 	}
-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	STARPU_ASSERT(!cures);
 }
 #endif
 
@@ -124,6 +122,7 @@ int main(int argc, char **argv)
                 .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
                 .nbuffers = 2,
 		.modes = {STARPU_R, STARPU_W}

+ 4 - 1
examples/heat/dw_factolu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -55,6 +55,7 @@ static struct starpu_codelet cl12 =
 	.cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -66,6 +67,7 @@ static struct starpu_codelet cl21 =
 	.cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -77,6 +79,7 @@ static struct starpu_codelet cl22 =
 	.cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},

+ 1 - 7
examples/heat/dw_factolu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -137,8 +137,6 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -202,8 +200,6 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -264,8 +260,6 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:

+ 2 - 0
examples/interface/complex.c

@@ -53,9 +53,11 @@ struct starpu_codelet cl_copy =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {copy_complex_codelet_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {copy_complex_codelet_opencl, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W},

+ 0 - 2
examples/interface/complex_kernels.cu

@@ -44,6 +44,4 @@ extern "C" void copy_complex_codelet_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 6
examples/interface/complex_kernels_opencl.c

@@ -66,14 +66,9 @@ void copy_complex_codelet_opencl(void *buffers[], void *_args)
                 if (local > global)
 			local=global;
 
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
 	starpu_opencl_release_kernel(kernel);
 }

+ 5 - 11
examples/lu/xlu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -70,9 +70,6 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(cures);
-
 			break;
 		}
 #endif
@@ -133,6 +130,7 @@ struct starpu_codelet cl22 =
 	.cpu_funcs = {STARPU_LU(cpu_u22), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_u22), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 3,
@@ -180,9 +178,6 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(cures);
-
 			break;
 #endif
 		default:
@@ -221,6 +216,7 @@ struct starpu_codelet cl12 =
 	.cpu_funcs = {STARPU_LU(cpu_u12), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_u12), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 2,
@@ -266,8 +262,6 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -306,6 +300,7 @@ struct starpu_codelet cl21 =
 	.cpu_funcs = {STARPU_LU(cpu_u21), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_u21), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 2,
@@ -596,8 +591,6 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 				}
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 			break;
 #endif
 		default:
@@ -637,6 +630,7 @@ struct starpu_codelet cl_pivot =
 	.cpu_funcs = {STARPU_LU(cpu_pivot), NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {STARPU_LU(cublas_pivot), NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	CAN_EXECUTE
 #endif
 	.nbuffers = 1,

+ 4 - 5
examples/mandelbrot/mandelbrot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -261,10 +261,7 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 	unsigned dim = 16;
 	size_t local[2] = {dim, 1};
 	size_t global[2] = {width, block_size};
-	clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, &event);
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
+	clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
 	starpu_opencl_release_kernel(kernel);
 }
 #endif
@@ -378,6 +375,7 @@ static struct starpu_codelet spmd_mandelbrot_cl =
 	.cpu_funcs = {compute_block_spmd, NULL},
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {compute_block_opencl, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.nbuffers = 1
 };
@@ -388,6 +386,7 @@ static struct starpu_codelet mandelbrot_cl =
 	.cpu_funcs = {compute_block, NULL},
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {compute_block_opencl, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.nbuffers = 1
 };

+ 3 - 7
examples/matvecmult/matvecmult.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -50,14 +50,9 @@ void opencl_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 
 	{
 		size_t global=nx*ny;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
 	starpu_opencl_release_kernel(kernel);
 }
 #endif
@@ -130,6 +125,7 @@ static struct starpu_codelet cl =
 {
 #ifdef STARPU_USE_OPENCL
         .opencl_funcs[0] = opencl_codelet,
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
         .nbuffers = 3,
 	.modes[0] = STARPU_R,

+ 2 - 2
examples/mult/xgemm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -177,7 +177,6 @@ static void mult_kernel_common(void *descr[], int type)
 	{
 		CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
 					     (TYPE)0.0, subC, ldC);
-		cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	}
 #endif
 }
@@ -207,6 +206,7 @@ static struct starpu_codelet cl =
 	.cpu_funcs = {cpu_mult, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cublas_mult, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},

+ 3 - 3
examples/pi/pi_redux.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -252,7 +252,6 @@ static void init_cuda_func(void *descr[], void *cl_arg)
 {
         unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
         cudaMemsetAsync(val, 0, sizeof(unsigned long), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -262,6 +261,7 @@ static struct starpu_codelet init_codelet =
         .cpu_funcs_name = {"init_cpu_func", NULL},
 #ifdef STARPU_HAVE_CURAND
         .cuda_funcs = {init_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_W},
         .nbuffers = 1
@@ -283,7 +283,6 @@ static void redux_cuda_func(void *descr[], void *cl_arg)
 	h_a += h_b;
 
 	cudaMemcpyAsync(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -301,6 +300,7 @@ static struct starpu_codelet redux_codelet =
 	.cpu_funcs_name = {"redux_cpu_func", NULL},
 #ifdef STARPU_HAVE_CURAND
 	.cuda_funcs = {redux_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2

+ 8 - 15
examples/reductions/dot_product.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -75,7 +75,6 @@ void init_cuda_func(void *descr[], void *cl_arg)
 {
 	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -111,6 +110,7 @@ static struct starpu_codelet init_codelet =
 	.cpu_funcs_name = {"init_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {init_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {init_opencl_func, NULL},
@@ -174,15 +174,10 @@ void redux_opencl_func(void *buffers[], void *args)
                 if (local > global)
 			local=global;
 
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
 	starpu_opencl_release_kernel(kernel);
 }
 #endif
@@ -194,9 +189,11 @@ static struct starpu_codelet redux_codelet =
 	.cpu_funcs_name = {"redux_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_func, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2,
@@ -247,7 +244,6 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 	current_dot += local_dot;
 
 	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -293,15 +289,10 @@ void dot_opencl_func(void *buffers[], void *args)
                 if (local > global)
 			local=global;
 
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
 	starpu_opencl_release_kernel(kernel);
 }
 #endif
@@ -313,9 +304,11 @@ static struct starpu_codelet dot_codelet =
 	.cpu_funcs_name = {"dot_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dot_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {dot_opencl_func, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_REDUX},

+ 0 - 1
examples/reductions/dot_product_kernels.cu

@@ -31,5 +31,4 @@ extern "C" void redux_cuda_func(void *descr[], void *_args)
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 1
examples/spmv/spmv.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -98,9 +98,11 @@ static struct starpu_codelet spmv_cl =
 	.cpu_funcs = {spmv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {spmv_kernel_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
         .opencl_funcs = {spmv_kernel_opencl, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_W},

+ 1 - 4
examples/spmv/spmv_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -98,9 +98,6 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
 }
 
 

+ 2 - 7
examples/spmv/spmv_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -90,14 +90,9 @@ void spmv_kernel_opencl(void *descr[], void *args)
 
 	{
                 size_t global=nrow;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
         starpu_opencl_release_kernel(kernel);
 }
 

+ 2 - 5
examples/stencil/life_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -109,8 +109,5 @@ opencl_life_update_host(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int
   clSetKernelArg(kernel, 8, sizeof(iter), &iter);
 
   cl_event ev;
-  clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dim, NULL, 0, NULL, &ev);
-  clWaitForEvents(1, &ev);
-  starpu_opencl_collect_stats(ev);
-  clReleaseEvent(ev);
+  clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dim, NULL, 0, NULL, NULL);
 }

+ 2 - 5
examples/stencil/shadow_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -105,11 +105,8 @@ opencl_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int ldy, int ldz,
         clSetKernelArg(kernel, 7, sizeof(i), &i);
 
         cl_event ev;
-        cl_int err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dim, NULL, 0, NULL, &ev);
+        cl_int err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, dim, NULL, 0, NULL, NULL);
         if (err != CL_SUCCESS)
                 STARPU_OPENCL_REPORT_ERROR(err);
-        clWaitForEvents(1, &ev);
-        starpu_opencl_collect_stats(ev);
-        clReleaseEvent(ev);
 }
 

+ 9 - 31
examples/stencil/stencil-kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -264,10 +264,6 @@ static void update_func_cuda(void *descr[], void *arg)
 #endif /* LIFE */
 	}
 
-	cudaError_t cures;
-	if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
-		STARPU_CUDA_REPORT_ERROR(cures);
-
 	if (block->bz == 0)
 		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
@@ -293,11 +289,8 @@ static void load_subblock_from_buffer_opencl(struct starpu_block_interface *bloc
 
         cl_command_queue cq;
         starpu_opencl_get_current_queue(&cq);
-        cl_int ret = clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, &event);
+        cl_int ret = clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, NULL);
 	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
-
-	clWaitForEvents(1, &event);
-	clReleaseEvent(event);
 }
 
 /*
@@ -362,17 +355,9 @@ static void update_func_opencl(void *descr[], void *arg)
                 cl_int ret = clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
 		if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
 
-		clWaitForEvents(1, &event);
-		clReleaseEvent(event);
 #endif /* LIFE */
 	}
 
-#ifndef LIFE
-	cl_int err;
-	if ((err = clFinish(cq)))
-		STARPU_OPENCL_REPORT_ERROR(err);
-#endif
-
 	if (block->bz == 0)
 		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
 }
@@ -465,9 +450,11 @@ struct starpu_codelet cl_update =
 	.cpu_funcs_name = {"update_func_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {update_func_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {update_func_opencl, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.model = &cl_update_model,
 	.nbuffers = 6,
@@ -535,11 +522,8 @@ static void load_subblock_into_buffer_opencl(struct starpu_block_interface *bloc
         starpu_opencl_get_current_queue(&cq);
 	cl_event event;
 
-        cl_int ret = clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, &event);
+        cl_int ret = clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, NULL);
 	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
-
-	clWaitForEvents(1, &event);
-	clReleaseEvent(event);
 }
 #endif /* STARPU_USE_OPENCL */
 
@@ -591,7 +575,6 @@ static void dummy_func_top_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg
 
 	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 /* bottom save, CUDA version */
@@ -605,7 +588,6 @@ static void dummy_func_bottom_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *
 
 	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
 	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif /* STARPU_USE_CUDA */
 
@@ -624,10 +606,6 @@ static void dummy_func_top_opencl(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *a
 
 	load_subblock_into_buffer_opencl(descr[0], descr[2], block_size_z);
 	load_subblock_into_buffer_opencl(descr[1], descr[3], block_size_z);
-
-        cl_command_queue cq;
-        starpu_opencl_get_current_queue(&cq);
-        clFinish(cq);
 }
 
 /* bottom save, OPENCL version */
@@ -641,10 +619,6 @@ static void dummy_func_bottom_opencl(void *descr[] STARPU_ATTRIBUTE_UNUSED, void
 
 	load_subblock_into_buffer_opencl(descr[0], descr[2], K);
 	load_subblock_into_buffer_opencl(descr[1], descr[3], K);
-
-        cl_command_queue cq;
-        starpu_opencl_get_current_queue(&cq);
-        clFinish(cq);
 }
 #endif /* STARPU_USE_OPENCL */
 
@@ -667,9 +641,11 @@ struct starpu_codelet save_cl_bottom =
 	.cpu_funcs_name = {"dummy_func_bottom_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {dummy_func_bottom_opencl, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.model = &save_cl_bottom_model,
 	.nbuffers = 4,
@@ -682,9 +658,11 @@ struct starpu_codelet save_cl_top =
 	.cpu_funcs_name = {"dummy_func_top_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dummy_func_top_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {dummy_func_top_opencl, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.model = &save_cl_top_model,
 	.nbuffers = 4,

+ 2 - 1
include/starpu_perfmodel.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -107,6 +107,7 @@ struct starpu_perfmodel_per_arch
 
 enum starpu_perfmodel_type
 {
+        STARPU_PERFMODEL_INVALID=0,
 	STARPU_PER_ARCH,
 	STARPU_COMMON,
 	STARPU_HISTORY_BASED,

+ 5 - 0
include/starpu_task.h

@@ -142,6 +142,11 @@ struct starpu_task
 	/* must StarPU release prologue_callback_arg ? - 0 by default */
 	unsigned prologue_callback_arg_free;
 
+	void (*prologue_callback_pop_func)(void *);
+	void *prologue_callback_pop_arg;
+	/* must StarPU release prologue_callback_pop_arg ? - 0 by default */
+	unsigned prologue_callback_pop_arg_free;
+
 	unsigned use_tag;
 	starpu_tag_t tag_id;
 

+ 17 - 15
include/starpu_task_util.h

@@ -32,21 +32,23 @@ extern "C"
 
 void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
 
-#define STARPU_VALUE		 (1<<16)
-#define STARPU_CALLBACK		 (2<<16)
-#define STARPU_CALLBACK_WITH_ARG (3<<16)
-#define STARPU_CALLBACK_ARG	 (4<<16)
-#define STARPU_PRIORITY		 (5<<16)
-#define STARPU_EXECUTE_ON_NODE	 (6<<16)
-#define STARPU_EXECUTE_ON_DATA	 (7<<16)
-#define STARPU_DATA_ARRAY        (8<<16)
-#define STARPU_TAG               (9<<16)
-#define STARPU_HYPERVISOR_TAG	 (10<<16)
-#define STARPU_FLOPS	         (11<<16)
-#define STARPU_SCHED_CTX	 (12<<16)
-#define STARPU_PROLOGUE_CALLBACK   (13<<16)
-#define STARPU_PROLOGUE_CALLBACK_ARG (14<<16)
-#define STARPU_EXECUTE_ON_WORKER (15<<16)
+#define STARPU_VALUE		 (1<<18)
+#define STARPU_CALLBACK		 (2<<18)
+#define STARPU_CALLBACK_WITH_ARG (3<<18)
+#define STARPU_CALLBACK_ARG	 (4<<18)
+#define STARPU_PRIORITY		 (5<<18)
+#define STARPU_EXECUTE_ON_NODE	 (6<<18)
+#define STARPU_EXECUTE_ON_DATA	 (7<<18)
+#define STARPU_DATA_ARRAY        (8<<18)
+#define STARPU_TAG               (9<<18)
+#define STARPU_HYPERVISOR_TAG	 (10<<18)
+#define STARPU_FLOPS	         (11<<18)
+#define STARPU_SCHED_CTX	 (12<<18)
+#define STARPU_PROLOGUE_CALLBACK   (13<<18)
+#define STARPU_PROLOGUE_CALLBACK_ARG (14<<18)
+#define STARPU_PROLOGUE_CALLBACK_POP   (15<<18)
+#define STARPU_PROLOGUE_CALLBACK_POP_ARG (16<<18)
+#define STARPU_EXECUTE_ON_WORKER (17<<18)
 
 struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
 int starpu_task_insert(struct starpu_codelet *cl, ...);

+ 1 - 1
libstarpu-mic.pc.in

@@ -22,7 +22,7 @@ includedir=@includedir@
 Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -D_XOPEN_SOURCE=600 @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@ -DSTARPU_USE_DEPRECATED_API
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@ -DSTARPU_USE_DEPRECATED_API
 Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SC_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@
 Requires: @HWLOC_REQUIRES@

+ 1 - 1
libstarpu.pc.in

@@ -22,7 +22,7 @@ includedir=@includedir@
 Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -D_XOPEN_SOURCE=600 @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@ -DSTARPU_USE_DEPRECATED_API -DSTARPU_USE_DEPRECATED_ONE_ZERO_API
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@ -DSTARPU_USE_DEPRECATED_API -DSTARPU_USE_DEPRECATED_ONE_ZERO_API
 Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SC_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@
 Requires: @HWLOC_REQUIRES@

+ 4 - 4
mpi/src/starpu_mpi.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -934,7 +934,7 @@ static void _starpu_mpi_copy_cb(void* arg)
 		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->copy_handle);
 		struct starpu_data_interface_ops *itf_dst = starpu_data_get_interface_ops(args->data_handle);
 		STARPU_ASSERT_MSG(itf_dst->unpack_data, "The data interface does not define an unpack function\n");
-		itf_dst->unpack_data(args->data_handle, 0, args->buffer, itf_src->get_size(args->copy_handle));
+		itf_dst->unpack_data(args->data_handle, STARPU_MAIN_RAM, args->buffer, itf_src->get_size(args->copy_handle));
 		free(args->buffer);
 	}
 	else
@@ -946,12 +946,12 @@ static void _starpu_mpi_copy_cb(void* arg)
 		if (!itf->copy_methods->ram_to_ram)
 		{
 			_STARPU_MPI_DEBUG(3, "Initiating any_to_any copy..\n");
-			itf->copy_methods->any_to_any(itf_src, 0, itf_dst, 0, NULL);
+			itf->copy_methods->any_to_any(itf_src, STARPU_MAIN_RAM, itf_dst, STARPU_MAIN_RAM, NULL);
 		}
 		else
 		{
 			_STARPU_MPI_DEBUG(3, "Initiating ram_to_ram copy..\n");
-			itf->copy_methods->ram_to_ram(itf_src, 0, itf_dst, 0);
+			itf->copy_methods->ram_to_ram(itf_src, STARPU_MAIN_RAM, itf_dst, STARPU_MAIN_RAM);
 		}
 	}
 

+ 50 - 0
mpi/src/starpu_mpi_task_insert.c

@@ -30,6 +30,8 @@
 #include <starpu_mpi_task_insert.h>
 #include <starpu_mpi_cache.h>
 
+typedef void (*_starpu_callback_func_t)(void *);
+
 static
 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *dest, size_t *size_on_nodes)
 {
@@ -279,6 +281,22 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 		{
 			va_arg(varg_list_copy, void *);
 		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK)
+                {
+                        (void)va_arg(varg_list, _starpu_callback_func_t);
+		}
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
+                {
+                        (void)va_arg(varg_list, void *);
+                }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+                {
+			(void)va_arg(varg_list, _starpu_callback_func_t);
+                }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+                {
+                        (void)va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			va_arg(varg_list_copy, int);
@@ -411,6 +429,22 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 		{
 			va_arg(varg_list_copy, void *);
 		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK)
+                {
+                        (void)va_arg(varg_list, _starpu_callback_func_t);
+		}
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
+                {
+                        (void)va_arg(varg_list, void *);
+                }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+                {
+			(void)va_arg(varg_list, _starpu_callback_func_t);
+                }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+                {
+                        (void)va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			va_arg(varg_list_copy, int);
@@ -535,6 +569,22 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, struct starpu_codelet *codelet,
 		{
 			va_arg(varg_list_copy, void *);
 		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK)
+                {
+                        (void)va_arg(varg_list, _starpu_callback_func_t);
+		}
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
+                {
+                        (void)va_arg(varg_list, void *);
+                }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+                {
+			(void)va_arg(varg_list, _starpu_callback_func_t);
+                }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+                {
+                        (void)va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			va_arg(varg_list_copy, int);

+ 1 - 1
sc_hypervisor/src/policies_utils/lp_programs.c

@@ -252,7 +252,7 @@ double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_
 double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw], double flops[ns], double res[ns][nw], 
 					       int  total_nw[nw], unsigned sched_ctxs[ns], double last_vmax)
 {
-	int integer = 0;
+	int integer = 1;
 	int s, w;
 	glp_prob *lp;
 

+ 3 - 3
src/common/fxt.h

@@ -291,8 +291,8 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_START(workerkind, workerid, devid, memnode)	\
 	FUT_DO_PROBE5(_STARPU_FUT_WORKER_INIT_START, workerkind, workerid, devid, memnode, _starpu_gettid());
 
-#define _STARPU_TRACE_WORKER_INIT_END				\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid());
+#define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
+	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 
 #define _STARPU_TRACE_START_CODELET_BODY(job)				\
 do {									\
@@ -655,7 +655,7 @@ do {										\
 /* Dummy macros in case FxT is disabled */
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
-#define _STARPU_TRACE_WORKER_INIT_END		do {} while(0)
+#define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
 #define _STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0)
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)

+ 1 - 1
src/core/jobs.h

@@ -123,7 +123,7 @@ LIST_TYPE(_starpu_job,
 	 * so we need a flag to differentiate them from "normal" tasks. */
 	unsigned reduction_task;
 
-	/* Used by MIC driver to record codelet start time instead of using a
+	/* Used to record codelet start time instead of using a
 	 * local variable */
 	struct timespec cl_start;
 

+ 1 - 1
src/core/perfmodel/perfmodel_history.c

@@ -56,7 +56,7 @@ size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_p
 {
 	struct starpu_task *task = j->task;
 
-	if (model && model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
+	if (model && model->per_arch && model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
 	{
 		return model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
 	}

+ 94 - 34
src/core/sched_ctx.c

@@ -278,7 +278,7 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 	if (sched_ctx->min_priority_is_set) sched_ctx->min_priority = min_prio;
 	sched_ctx->max_priority_is_set = max_prio_set;
 	if (sched_ctx->max_priority_is_set) sched_ctx->max_priority = max_prio;
-	sem_init(&sched_ctx->parallel_code_sem, 0, 0);
+
 
 	_starpu_barrier_counter_init(&sched_ctx->tasks_barrier, 0);
 	_starpu_barrier_counter_init(&sched_ctx->ready_tasks_barrier, 0);
@@ -560,7 +560,6 @@ static void _starpu_delete_sched_ctx(struct _starpu_sched_ctx *sched_ctx)
 	sched_ctx->sched_policy = NULL;
 
 	STARPU_PTHREAD_MUTEX_DESTROY(&sched_ctx->empty_ctx_mutex);
-	sem_destroy(&sched_ctx->parallel_code_sem);
 	sched_ctx->id = STARPU_NMAX_SCHED_CTXS;
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_bitmap_free(sched_ctx->hwloc_workers_set);
@@ -1587,12 +1586,13 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 			STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 	}
 
+	struct _starpu_worker *master_worker = _starpu_get_worker_struct(master);
 	int workerid;
 	for(w = 0; w < nworkers; w++)
 	{
 		workerid = workerids[w];
 		if(current_worker_id == -1 || workerid != current_worker_id)
-			sem_wait(&sched_ctx->parallel_code_sem);
+			sem_wait(&master_worker->parallel_code_sem);
 	}
 	return;
 }
@@ -1600,13 +1600,14 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 void _starpu_sched_ctx_signal_worker_blocked(int workerid)
 {
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	struct _starpu_worker *master_worker = _starpu_get_worker_struct(worker->master);
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	struct _starpu_sched_ctx_list *l = NULL;
 	for (l = worker->sched_ctx_list; l; l = l->next)
 	{
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		if(sched_ctx->id != 0)
-			sem_post(&sched_ctx->parallel_code_sem);
+			sem_post(&master_worker->parallel_code_sem);
 	}	
 	return;
 }
@@ -1645,7 +1646,7 @@ void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, uns
 {
 	int *workerids;
 	int nworkers = starpu_sched_ctx_get_workers_list(sched_ctx_id, &workerids);
-	int master = starpu_sched_ctx_book_workers_for_task(sched_ctx_id, workerids, nworkers);
+	_starpu_sched_ctx_get_workers_to_sleep(sched_ctx_id, workerids, nworkers, -1);
 
 	/* bind current thread on all workers of the context */
 	_starpu_sched_ctx_bind_thread_to_ctx_cpus(sched_ctx_id);
@@ -1654,7 +1655,7 @@ void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, uns
 	void* ret = func(param);
 
 	/* wake up starpu workers */
-	starpu_sched_ctx_unbook_workers_for_task(sched_ctx_id, master);
+	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, -1);
 
 	return ret;
 }
@@ -1685,44 +1686,103 @@ void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids,
 	return;
 }
 
-int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids, int nworkers)
+/* int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids, int nworkers) */
+/* { */
+/* 	int current_worker_id = starpu_worker_get_id(); */
+
+/* 	int final_workerids[nworkers]; */
+/* 	int nfinal_workerids = 0; */
+/* 	int w; */
+/* 	int master = -1; */
+/* 	for(w = 0; w < nworkers; w++) */
+/* 	{ */
+/* 		if(current_worker_id == -1) */
+/* 		{ */
+/* 			final_workerids[nfinal_workerids++] = workerids[w];                           */
+/* 			if(nfinal_workerids == nworkers - 1)                          */
+/* 			{ */
+/* 				master = workerids[nfinal_workerids];   */
+/* 				break;   */
+/* 			} */
+/* 		} */
+/* 		else */
+/* 		{ */
+/* 			if(workerids[w] != current_worker_id) */
+/* 				final_workerids[nfinal_workerids++] = workerids[w]; */
+/* 			else */
+/* 			{ */
+/* 				if(nfinal_workerids == nworkers - 1) */
+/* 				{ */
+/* 					master = workerids[nfinal_workerids]; */
+/* 					break; */
+/* 				} */
+/* 				else */
+/* 					master = current_worker_id; */
+/* 			}	 */
+/* 		} */
+/* 	} */
+/* 	if(master == -1 && nfinal_workerids > 0) */
+/* 	{ */
+/* 		nfinal_workerids--; */
+/* 		master = final_workerids[nfinal_workerids]; */
+/* 	} */
+/* 	/\* get starpu workers to sleep *\/ */
+/* 	_starpu_sched_ctx_get_workers_to_sleep(sched_ctx_id, final_workerids, nfinal_workerids, master); */
+
+/* 	/\* bind current thread on all workers of the context *\/ */
+/* //	_starpu_sched_ctx_bind_thread_to_ctx_cpus(sched_ctx_id); */
+/* 	return master; */
+/* } */
+
+static void _starpu_sched_ctx_wake_these_workers_up(unsigned sched_ctx_id, int *workerids, int nworkers)
 {
 	int current_worker_id = starpu_worker_get_id();
-
-	int final_workerids[nworkers];
-	int nfinal_workerids = 0;
+	
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	int w;
-	int master = -1;
+	struct _starpu_worker *worker = NULL;
 	for(w = 0; w < nworkers; w++)
 	{
-		if(current_worker_id == -1)
+		worker = _starpu_get_worker_struct(workerids[w]);
+		if(current_worker_id == -1 || worker->workerid != current_worker_id)
 		{
-			final_workerids[nfinal_workerids++] = workerids[w];
-			if(nfinal_workerids == nworkers)
-				master = workerids[nfinal_workerids-1];
+			STARPU_PTHREAD_MUTEX_LOCK(&worker->parallel_sect_mutex);
+			STARPU_PTHREAD_COND_SIGNAL(&worker->parallel_sect_cond);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&worker->parallel_sect_mutex);
 		}
 		else
-		{
-			if(workerids[w] != current_worker_id)
-				final_workerids[nfinal_workerids++] = workerids[w];
-			else
-			{
-				if(nfinal_workerids == nworkers - 1)
-				{
-					master = workerids[nfinal_workerids];
-					break;
-				}
-				else
-					master = current_worker_id;
-			}	
-		}
+			worker->parallel_sect = 0;
 	}
-	/* get starpu workers to sleep */
-	_starpu_sched_ctx_get_workers_to_sleep(sched_ctx_id, final_workerids, nfinal_workerids, master);
+	return;
+}
 
-	/* bind current thread on all workers of the context */
-//	_starpu_sched_ctx_bind_thread_to_ctx_cpus(sched_ctx_id);
-	return master;
+
+int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids, int nworkers)
+{ 
+	int new_master = workerids[nworkers-1];
+	int w;
+	int nput_to_sleep = 0;
+	int nwake_up = 0;
+	int put_to_sleep[nworkers];
+	int wake_up[nworkers];
+	
+	for(w = 0 ; w < nworkers ; w++)
+	{
+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[w]);
+		if (worker->master == -1 && workerids[w] != new_master)
+			put_to_sleep[nput_to_sleep++] = workerids[w];
+		else if(worker->master != -1 && workerids[w] == new_master)
+			wake_up[nwake_up++] = workerids[w];
+		
+		if (workerids[w] != new_master)
+			worker->master = new_master;
+		else
+			worker->master = -1;
+	}
+	_starpu_sched_ctx_wake_these_workers_up(sched_ctx_id, wake_up, nwake_up);
+	_starpu_sched_ctx_get_workers_to_sleep(sched_ctx_id, put_to_sleep, nput_to_sleep, new_master);
+	
+	return new_master;
 }
 
 void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master)

+ 0 - 4
src/core/sched_ctx.h

@@ -103,10 +103,6 @@ struct _starpu_sched_ctx
      	int min_priority_is_set;
 	int max_priority_is_set;
 
-	/* semaphore that block appl thread until threads are ready 
-	   to exec the parallel code */
-	sem_t parallel_code_sem;
-
 	/* hwloc tree structure of workers */
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_bitmap_t hwloc_workers_set;

+ 3 - 0
src/core/sched_policy.c

@@ -850,6 +850,9 @@ profiling:
 		}
 	}
 
+	if(task->prologue_callback_pop_func)
+		task->prologue_callback_pop_func(task->prologue_callback_pop_arg);
+
 	return task;
 }
 

+ 4 - 0
src/core/task.c

@@ -172,6 +172,10 @@ void _starpu_task_destroy(struct starpu_task *task)
 		if (task->prologue_callback_arg_free)
 			free(task->prologue_callback_arg);
 
+		/* Does user want StarPU release prologue_pop_arg ? */
+		if (task->prologue_callback_pop_arg_free)
+			free(task->prologue_callback_pop_arg);
+
 		free(task);
 	}
 }

+ 24 - 16
src/core/topology.c

@@ -821,6 +821,9 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	int ncuda = config->conf->ncuda;
+	int nworker_per_cuda = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+
+	STARPU_ASSERT_MSG(nworker_per_cuda > 0, "STARPU_NWORKER_PER_CUDA has to be > 0");
 
 	if (ncuda != 0)
 	{
@@ -855,25 +858,30 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	unsigned cudagpu;
 	for (cudagpu = 0; cudagpu < topology->ncudagpus; cudagpu++)
 	{
-		int worker_idx = topology->nworkers + cudagpu;
-		config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
 		int devid = _starpu_get_next_cuda_gpuid(config);
-		config->workers[worker_idx].perf_arch.type = STARPU_CUDA_WORKER;
-		config->workers[worker_idx].perf_arch.devid = cudagpu;
-		config->workers[worker_idx].perf_arch.ncore = 0;
-		config->workers[worker_idx].devid = devid;
-		config->workers[worker_idx].subworkerid = 0;
-		config->workers[worker_idx].worker_mask = STARPU_CUDA;
-		config->worker_mask |= STARPU_CUDA;
-
-		struct handle_entry *entry;
-		entry = (struct handle_entry *) malloc(sizeof(*entry));
-		STARPU_ASSERT(entry != NULL);
-		entry->gpuid = devid;
-		HASH_ADD_INT(devices_using_cuda, gpuid, entry);
+		for (i = 0; i < nworker_per_cuda; i++)
+		{
+			int worker_idx = topology->nworkers + cudagpu * nworker_per_cuda + i;
+			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
+			config->workers[worker_idx].perf_arch.type = STARPU_CUDA_WORKER;
+			config->workers[worker_idx].perf_arch.devid = devid;
+			// TODO: fix perfmodels etc.
+			//config->workers[worker_idx].perf_arch.ncore = nworker_per_cuda - 1;
+			config->workers[worker_idx].perf_arch.ncore = 0;
+			config->workers[worker_idx].devid = devid;
+			config->workers[worker_idx].subworkerid = i;
+			config->workers[worker_idx].worker_mask = STARPU_CUDA;
+			config->worker_mask |= STARPU_CUDA;
+
+			struct handle_entry *entry;
+			entry = (struct handle_entry *) malloc(sizeof(*entry));
+			STARPU_ASSERT(entry != NULL);
+			entry->gpuid = devid;
+			HASH_ADD_INT(devices_using_cuda, gpuid, entry);
+		}
         }
 
-	topology->nworkers += topology->ncudagpus;
+	topology->nworkers += topology->ncudagpus * nworker_per_cuda;
 #endif
 
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)

+ 40 - 33
src/core/workers.c

@@ -326,6 +326,9 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
  * Runtime initialization methods
  */
 
+#ifdef STARPU_USE_CUDA
+static struct _starpu_worker_set cuda_worker_set[STARPU_MAXCUDADEVS];
+#endif
 #ifdef STARPU_USE_MIC
 static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
 #endif
@@ -408,6 +411,7 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	starpu_task_list_init(&workerarg->local_tasks);
 	workerarg->current_task = NULL;
 	workerarg->set = NULL;
+	sem_init(&workerarg->parallel_code_sem, 0, 0);
 
 	/* if some codelet's termination cannot be handled directly :
 	 * for instance in the Gordon driver, Gordon tasks' callbacks
@@ -512,9 +516,8 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
-#ifdef STARPU_USE_MIC
+#if defined(STARPU_USE_MIC) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 		unsigned devid = workerarg->devid;
-		unsigned subworkerid = workerarg->subworkerid;
 #endif
 
 		_STARPU_DEBUG("initialising worker %u/%u\n", worker, nworkers);
@@ -561,12 +564,22 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				driver.id.cuda_id = workerarg->devid;
 				if (_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
+					/* We spawn only one thread per CUDA device,
+					 * which will control all CUDA workers of this
+					 * device. (by using a worker set). */
+					if (cuda_worker_set[devid].started)
+						goto worker_set_initialized;
+
+					cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+					cuda_worker_set[devid].workers = workerarg;
+					cuda_worker_set[devid].set_is_initialized = 0;
+
 					STARPU_PTHREAD_CREATE_ON(
 						workerarg->name,
-						&workerarg->worker_thread,
+						&cuda_worker_set[devid].worker_thread,
 						NULL,
 						_starpu_cuda_worker,
-						workerarg,
+						&cuda_worker_set[devid],
 						worker+1);
 #ifdef STARPU_USE_FXT
 					STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
@@ -574,6 +587,14 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 						STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 #endif
+					STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
+					while (!cuda_worker_set[devid].set_is_initialized)
+						STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
+									 &cuda_worker_set[devid].mutex);
+					STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
+					cuda_worker_set[devid].started = 1;
+		worker_set_initialized:
+					workerarg->set = &cuda_worker_set[devid];
 				}
 				else
 				{
@@ -608,8 +629,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 #ifdef STARPU_USE_MIC
 			case STARPU_MIC_WORKER:
-				/* We use the Gordon approach for the MIC,
-				 * which consists in spawning only one thread
+				/* We spawn only one thread
 				 * per MIC device, which will control all MIC
 				 * workers of this device. (by using a worker set). */
 				if (mic_worker_set[devid].started)
@@ -644,16 +664,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 								  &mic_worker_set[devid].mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mic_worker_set[devid].mutex);
 
+				mic_worker_set[devid].started = 1;
 		worker_set_initialized:
 				workerarg->set = &mic_worker_set[devid];
-				mic_worker_set[devid].started = 1;
-
-#ifdef STARPU_USE_FXT
-				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-				while (!workerarg->worker_is_running)
-					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
-#endif
 
 				break;
 #endif /* STARPU_USE_MIC */
@@ -706,14 +719,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				cpu++;
 				break;
 			case STARPU_CUDA_WORKER:
-				driver.id.cuda_id = workerarg->devid;
-				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
-					break;
-				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
-				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-				while (!workerarg->worker_is_initialized)
-					STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+				/* Already waited above */
 				break;
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 			case STARPU_OPENCL_WORKER:
@@ -1741,11 +1747,17 @@ int _starpu_worker_get_nsched_ctxs(int workerid)
 	return config.workers[workerid].nsched_ctxs;
 }
 
-static struct _starpu_worker *
+static void *
 _starpu_get_worker_from_driver(struct starpu_driver *d)
 {
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned workerid;
+
+#ifdef STARPU_USE_CUDA
+	if (d->type == STARPU_CUDA_WORKER)
+		return &cuda_worker_set[d->id.cuda_id];
+#endif
+
 	for (workerid = 0; workerid < nworkers; workerid++)
 	{
 		if (starpu_worker_get_type(workerid) == d->type)
@@ -1760,12 +1772,6 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 					return worker;
 				break;
 #endif
-#ifdef STARPU_USE_CUDA
-			case STARPU_CUDA_WORKER:
-				if (worker->devid == d->id.cuda_id)
-					return worker;
-				break;
-#endif
 #ifdef STARPU_USE_OPENCL
 			case STARPU_OPENCL_WORKER:
 			{
@@ -1795,7 +1801,7 @@ starpu_driver_run(struct starpu_driver *d)
 		return -EINVAL;
 	}
 
-	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+	void *worker = _starpu_get_worker_from_driver(d);
 
 	switch (d->type)
 	{
@@ -1821,7 +1827,7 @@ int
 starpu_driver_init(struct starpu_driver *d)
 {
 	STARPU_ASSERT(d);
-	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+	void *worker = _starpu_get_worker_from_driver(d);
 
 	switch (d->type)
 	{
@@ -1846,7 +1852,7 @@ int
 starpu_driver_run_once(struct starpu_driver *d)
 {
 	STARPU_ASSERT(d);
-	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+	void *worker = _starpu_get_worker_from_driver(d);
 
 	switch (d->type)
 	{
@@ -1871,7 +1877,7 @@ int
 starpu_driver_deinit(struct starpu_driver *d)
 {
 	STARPU_ASSERT(d);
-	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+	void *worker = _starpu_get_worker_from_driver(d);
 
 	switch (d->type)
 	{
@@ -1951,3 +1957,4 @@ unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
 	}
 	return nsched_ctxs;
 }
+

+ 4 - 0
src/core/workers.h

@@ -121,6 +121,10 @@ LIST_TYPE(_starpu_worker,
 	/* id of the master worker */
 	int master;
 
+	/* semaphore that block appl thread until threads are ready 
+	   to exec the parallel code */
+	sem_t parallel_code_sem;
+
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */

+ 7 - 1
src/datawizard/coherency.c

@@ -194,7 +194,13 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
 			/* GPUs not always allow direct remote access: if CUDA4
 			 * is enabled, we allow two CUDA devices to communicate. */
-			return kind == STARPU_CPU_RAM || kind == STARPU_CUDA_RAM;
+			return
+#if 0
+				/* CUDA does not seem very safe with concurrent
+				 * transfer queueing, avoid queueing from CPUs */
+				kind == STARPU_CPU_RAM ||
+#endif
+				kind == STARPU_CUDA_RAM;
 		}
 #else
 			/* Direct GPU-GPU transfers are not allowed in general */

+ 7 - 7
src/datawizard/copy_driver.c

@@ -160,10 +160,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		else
 		{
 			req->async_channel.type = STARPU_CUDA_RAM;
-			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_out_transfer_stream(src_node);
+			stream = starpu_cuda_get_local_out_transfer_stream();
 			if (copy_methods->cuda_to_ram_async)
 				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -195,11 +195,11 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		else
 		{
 			req->async_channel.type = STARPU_CUDA_RAM;
-			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
-			stream = starpu_cuda_get_in_transfer_stream(dst_node);
+			stream = starpu_cuda_get_local_in_transfer_stream();
 			if (copy_methods->ram_to_cuda_async)
 				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 			else
@@ -228,7 +228,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		else
 		{
 			req->async_channel.type = STARPU_CUDA_RAM;
-			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
 			stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
@@ -531,7 +531,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
-				async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
+				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
 				cudaMemcpyDeviceToHost);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
@@ -539,7 +539,7 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) src + src_offset, src_node,
 				(void*) dst + dst_offset, dst_node,
 				size,
-				async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
+				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
 				cudaMemcpyHostToDevice);
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):

+ 24 - 14
src/datawizard/data_request.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -394,14 +394,14 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
 	if (r->retval == -EAGAIN)
 	{
-		/* The request was successful, but could not be terminted
-		 * immediatly. We will handle the completion of the request
+		/* The request was successful, but could not be terminated
+		 * immediately. We will handle the completion of the request
 		 * asynchronously. The request is put in the list of "pending"
 		 * requests in the meantime. */
 		_starpu_spin_unlock(&handle->header_lock);
 
 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
-		_starpu_data_request_list_push_front(data_requests_pending[r->handling_node], r);
+		_starpu_data_request_list_push_back(data_requests_pending[r->handling_node], r);
 		data_requests_npending[r->handling_node]++;
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
 
@@ -415,13 +415,15 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 }
 
-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
+int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 	struct _starpu_data_request_list *empty_list;
 	int ret = 0;
 
+	*pushed = 0;
+
 	/* Here helgrind would should that this is an un protected access.
 	 * We however don't care about missing an entry, we will get called
 	 * again sooner or later. */
@@ -479,6 +481,8 @@ int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 			_starpu_data_request_list_push_back(new_data_requests, r);
 			break;
 		}
+
+		(*pushed)++;
 	}
 
 	while (!_starpu_data_request_list_empty(local_list))
@@ -500,13 +504,15 @@ int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 	return ret;
 }
 
-void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc)
+void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 	struct _starpu_data_request_list *new_prefetch_requests;
 	struct _starpu_data_request_list *empty_list;
 
+	*pushed = 0;
+
 	if (_starpu_data_request_list_empty(prefetch_requests[src_node]))
 		return;
 
@@ -563,6 +569,8 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 			}
 			break;
 		}
+
+		(*pushed)++;
 	}
 
 	while(!_starpu_data_request_list_empty(local_list))
@@ -590,7 +598,7 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 	_starpu_data_request_list_delete(local_list);
 }
 
-static void _handle_pending_node_data_requests(unsigned src_node, unsigned force)
+static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 {
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
 //
@@ -599,12 +607,12 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 	unsigned taken, kept;
 
 	if (_starpu_data_request_list_empty(data_requests_pending[src_node]))
-		return;
+		return 0;
 
 	empty_list = _starpu_data_request_list_new();
 	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[src_node]) && !force)
 		/* List is busy, do not bother with it */
-		return;
+		return 0;
 
 	/* for all entries of the list */
 	struct _starpu_data_request_list *local_list = data_requests_pending[src_node];
@@ -613,7 +621,7 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 		/* there is no request */
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
 		_starpu_data_request_list_delete(empty_list);
-		return;
+		return 0;
 	}
 	data_requests_pending[src_node] = empty_list;
 
@@ -680,16 +688,18 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 
 	_starpu_data_request_list_delete(local_list);
 	_starpu_data_request_list_delete(new_data_requests_pending);
+
+	return taken - kept;
 }
 
-void _starpu_handle_pending_node_data_requests(unsigned src_node)
+int _starpu_handle_pending_node_data_requests(unsigned src_node)
 {
-	_handle_pending_node_data_requests(src_node, 0);
+	return _handle_pending_node_data_requests(src_node, 0);
 }
 
-void _starpu_handle_all_pending_node_data_requests(unsigned src_node)
+int _starpu_handle_all_pending_node_data_requests(unsigned src_node)
 {
-	_handle_pending_node_data_requests(src_node, 1);
+	return _handle_pending_node_data_requests(src_node, 1);
 }
 
 int _starpu_check_that_no_data_request_exists(unsigned node)

+ 5 - 5
src/datawizard/data_request.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2010, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -110,11 +110,11 @@ void _starpu_init_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
 void _starpu_post_data_request(struct _starpu_data_request *r, unsigned handling_node);
 /* returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc);
-void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc);
+int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
+void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
 
-void _starpu_handle_pending_node_data_requests(unsigned src_node);
-void _starpu_handle_all_pending_node_data_requests(unsigned src_node);
+int _starpu_handle_pending_node_data_requests(unsigned src_node);
+int _starpu_handle_all_pending_node_data_requests(unsigned src_node);
 
 int _starpu_check_that_no_data_request_exists(unsigned node);
 

+ 26 - 7
src/datawizard/datawizard.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,8 +24,10 @@
 #include <msg/msg.h>
 #endif
 
-void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
+int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests)
 {
+	int ret = 0;
+
 #if STARPU_DEVEL
 #warning FIXME
 #endif
@@ -35,11 +37,28 @@ void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
 	STARPU_UYIELD();
 
 	/* in case some other driver requested data */
-	_starpu_handle_pending_node_data_requests(memory_node);
-	if (_starpu_handle_node_data_requests(memory_node, may_alloc) == 0)
-		/* We pushed all pending requests, we can afford pushing
-		 * prefetch requests */
-		_starpu_handle_node_prefetch_requests(memory_node, may_alloc);
+	if (_starpu_handle_pending_node_data_requests(memory_node))
+		ret = 1;
+	if (push_requests)
+	{
+		unsigned pushed;
+		if (_starpu_handle_node_data_requests(memory_node, may_alloc, &pushed) == 0)
+		{
+			if (pushed)
+				ret = 1;
+			/* We pushed all pending requests, we can afford pushing
+			 * prefetch requests */
+			_starpu_handle_node_prefetch_requests(memory_node, may_alloc, &pushed);
+		}
+		if (pushed)
+			ret = 1;
+	}
 	_starpu_execute_registered_progression_hooks();
+
+	return ret;
 }
 
+void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
+{
+	__starpu_datawizard_progress(memory_node, may_alloc, 1);
+}

+ 2 - 1
src/datawizard/datawizard.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -33,6 +33,7 @@
 
 #include <core/dependencies/implicit_data_deps.h>
 
+int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
 void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc);
 
 #endif // __DATAWIZARD_H__

+ 1 - 1
src/datawizard/filters.c

@@ -341,7 +341,6 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 		_starpu_spin_lock(&child_handle->header_lock);
 
 		_starpu_data_unregister_ram_pointer(child_handle);
-		_starpu_data_free_interfaces(child_handle);
 
 		for (worker = 0; worker < nworkers; worker++)
 		{
@@ -424,6 +423,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	for (child = 0; child < root_handle->nchildren; child++)
 	{
 		starpu_data_handle_t child_handle = starpu_data_get_child(root_handle, child);
+		_starpu_data_free_interfaces(child_handle);
 		_starpu_spin_unlock(&child_handle->header_lock);
 		_starpu_spin_destroy(&child_handle->header_lock);
 

+ 1 - 1
src/datawizard/footprint.c

@@ -50,7 +50,7 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struc
 	{
 		footprint = model->footprint(task);
 	}
-	else if (model != NULL && 
+	else if (model != NULL && model->per_arch &&
 			model->per_arch[arch->type] != NULL &&
 			model->per_arch[arch->type][arch->devid] != NULL &&
 			model->per_arch[arch->type][arch->devid][arch->ncore] != NULL &&

+ 1 - 1
src/datawizard/interfaces/data_interface.c

@@ -754,7 +754,6 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	size_t size = _starpu_data_get_size(handle);
 
 	_starpu_data_unregister_ram_pointer(handle);
-	_starpu_data_free_interfaces(handle);
 
 	/* Destroy the data now */
 	unsigned node;
@@ -774,6 +773,7 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 		if (local->allocated && local->automatically_allocated)
 			_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
 	}
+	_starpu_data_free_interfaces(handle);
 
 	_starpu_memory_stats_free(handle);
 	_starpu_data_requester_list_delete(handle->req_list);

+ 39 - 15
src/datawizard/memalloc.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -259,6 +259,8 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 	if (mc->automatically_allocated &&
 		(!handle || replicate->refcnt == 0))
 	{
+		void *interface;
+
 		if (handle)
 			STARPU_ASSERT(replicate->allocated);
 
@@ -273,8 +275,14 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 		}
 #endif
 
+		if (handle)
+			interface = replicate->data_interface;
+		else
+			interface = mc->chunk_interface;
+		STARPU_ASSERT(interface);
+
 		_STARPU_TRACE_START_FREE(node, mc->size);
-		mc->ops->free_data_on_node(mc->chunk_interface, node);
+		mc->ops->free_data_on_node(interface, node);
 		_STARPU_TRACE_END_FREE(node);
 
 		if (handle)
@@ -301,7 +309,8 @@ static size_t do_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 		mc->size = _starpu_data_get_size(handle);
 	}
 
-	mc->replicate->mc=NULL;
+	if (mc->replicate)
+		mc->replicate->mc=NULL;
 
 	/* free the actual buffer */
 	size = free_memory_on_node(mc, node);
@@ -309,7 +318,6 @@ static size_t do_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 	/* remove the mem_chunk from the list */
 	_starpu_mem_chunk_list_erase(mc_list[node], mc);
 
-	free(mc->chunk_interface);
 	_starpu_mem_chunk_delete(mc);
 
 	return size;
@@ -351,7 +359,7 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 
 		if (mc->replicate->refcnt == 0)
 		{
-			/* Note taht there is no need to transfer any data or
+			/* Note that there is no need to transfer any data or
 			 * to update the status in terms of MSI protocol
 			 * because this memchunk is associated to a replicate
 			 * in "relaxed coherency" mode. */
@@ -409,22 +417,36 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
  * therefore not in the cache. */
 static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_replicate, struct _starpu_mem_chunk *mc, unsigned is_already_in_mc_list)
 {
+	void *interface;
+
 	/* we found an appropriate mem chunk: so we get it out
 	 * of the "to free" list, and reassign it to the new
 	 * piece of data */
 
 	struct _starpu_data_replicate *old_replicate = mc->replicate;
-	old_replicate->allocated = 0;
-	old_replicate->automatically_allocated = 0;
-	old_replicate->initialized = 0;
+	if (old_replicate)
+	{
+		old_replicate->allocated = 0;
+		old_replicate->automatically_allocated = 0;
+		old_replicate->initialized = 0;
+		interface = old_replicate->data_interface;
+	}
+	else
+		interface = mc->chunk_interface;
 
 	new_replicate->allocated = 1;
 	new_replicate->automatically_allocated = 1;
 	new_replicate->initialized = 0;
 
 	STARPU_ASSERT(new_replicate->data_interface);
-	STARPU_ASSERT(mc->chunk_interface);
-	memcpy(new_replicate->data_interface, mc->chunk_interface, mc->size_interface);
+	STARPU_ASSERT(interface);
+	memcpy(new_replicate->data_interface, interface, mc->size_interface);
+
+	if (!old_replicate)
+	{
+		free(mc->chunk_interface);
+		mc->chunk_interface = NULL;
+	}
 
 	mc->data = new_replicate->handle;
 	/* mc->ops, mc->footprint and mc->interface should be
@@ -717,12 +739,8 @@ static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_repli
 	mc->relaxed_coherency = replicate->relaxed_coherency;
 	mc->replicate = replicate;
 	mc->replicate->mc = mc;
-
-	/* Save a copy of the interface */
-	mc->chunk_interface = malloc(interface_size);
+	mc->chunk_interface = NULL;
 	mc->size_interface = interface_size;
-	STARPU_ASSERT(mc->chunk_interface);
-	memcpy(mc->chunk_interface, replicate->data_interface, interface_size);
 
 	return mc;
 }
@@ -761,8 +779,14 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 	 * by freeing this.  */
 	mc->size = size;
 
+	/* Also keep the interface parameters and pointers, for later reuse
+	 * while detached, or freed */
+	mc->chunk_interface = malloc(mc->size_interface);
+	memcpy(mc->chunk_interface, replicate->data_interface, mc->size_interface);
+
 	/* This memchunk doesn't have to do with the data any more. */
 	replicate->mc = NULL;
+	mc->replicate = NULL;
 	replicate->allocated = 0;
 	replicate->automatically_allocated = 0;
 

+ 8 - 6
src/datawizard/memalloc.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -33,11 +33,13 @@ LIST_TYPE(_starpu_mem_chunk,
 
 	uint32_t footprint;
 
-	/* The footprint of the data is not sufficient to determine whether two
-	 * pieces of data have the same layout (there could be collision in the
-	 * hash function ...) so we still keep a copy of the actual layout (ie.
-	 * the data interface) to stay on the safe side. We make a copy of
-	 * because when a data is deleted, the memory chunk remains.
+	/*
+	 * When re-using a memchunk, the footprint of the data is not
+	 * sufficient to determine whether two pieces of data have the same
+	 * layout (there could be collision in the hash function ...) so we
+	 * still keep a copy of the actual layout (ie. the data interface) to
+	 * stay on the safe side while the memchunk is detached from an actual
+	 * data.
 	 */
 	struct starpu_data_interface_ops *ops;
 	void *chunk_interface;

+ 3 - 3
src/datawizard/memory_nodes.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -69,9 +69,9 @@ unsigned _starpu_memory_node_get_local_key(void)
 	memory_node = (unsigned *) STARPU_PTHREAD_GETSPECIFIC(memory_node_key);
 
 	/* in case this is called by the programmer, we assume the RAM node
-	   is the appropriate memory node ... so we return 0 XXX */
+	   is the appropriate memory node ... XXX */
 	if (STARPU_UNLIKELY(!memory_node))
-		return 0;
+		return STARPU_MAIN_RAM;
 
 	return *memory_node;
 }

+ 9 - 2
src/debug/traces/starpu_fxt.c

@@ -164,7 +164,9 @@ static void register_worker_id(unsigned long tid, int workerid)
 	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase in ./configure invocation the maximum number of CPUs and GPUs to the same value as was used for execution");
 
 	/* only register a thread once */
-	STARPU_ASSERT(entry == NULL);
+	//STARPU_ASSERT(entry == NULL);
+	if (entry)
+		return;
 
 	entry = malloc(sizeof(*entry));
 	entry->tid = tid;
@@ -423,12 +425,17 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 static void handle_worker_init_end(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	char *prefix = options->file_prefix;
+	int worker;
 
 	if (out_paje_file)
 		worker_set_state(get_event_time_stamp(ev, options), prefix, ev->param[0], "B");
 
+	if (ev->nb_params < 2)
+		worker = find_worker_id(ev->param[0]);
+	else
+		worker = ev->param[1];
+
 	/* Initilize the accumulated time counters */
-	int worker = find_worker_id(ev->param[0]);
 	last_activity_flush_timestamp[worker] = get_event_time_stamp(ev, options);
 	accumulated_sleep_time[worker] = 0.0;
 	accumulated_exec_time[worker] = 0.0;

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -176,7 +176,7 @@ int _starpu_cpu_driver_init(struct _starpu_worker *cpu_worker)
 
 	cpu_worker->status = STATUS_UNKNOWN;
 
-	_STARPU_TRACE_WORKER_INIT_END;
+	_STARPU_TRACE_WORKER_INIT_END(cpu_worker->workerid);
 
 	/* tell the main thread that we are ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&cpu_worker->mutex);

+ 185 - 77
src/drivers/cuda/driver_cuda.c

@@ -37,15 +37,16 @@
 #endif
 
 /* the number of CUDA devices */
-static int ncudagpus;
+static unsigned ncudagpus;
 
 static size_t global_mem[STARPU_MAXCUDADEVS];
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
-static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
-static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
+static cudaStream_t out_transfer_streams[STARPU_NMAXWORKERS];
+static cudaStream_t in_transfer_streams[STARPU_NMAXWORKERS];
 static cudaStream_t peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
+static cudaEvent_t task_events[STARPU_NMAXWORKERS];
 #endif /* STARPU_USE_CUDA */
 
 void
@@ -115,18 +116,18 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 }
 
 #ifdef STARPU_USE_CUDA
-cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node)
+cudaStream_t starpu_cuda_get_local_in_transfer_stream(void)
 {
-	int devid = _starpu_memory_node_get_devid(node);
+	int worker = starpu_worker_get_id();
 
-	return in_transfer_streams[devid];
+	return in_transfer_streams[worker];
 }
 
-cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node)
+cudaStream_t starpu_cuda_get_local_out_transfer_stream(void)
 {
-	int devid = _starpu_memory_node_get_devid(node);
+	int worker = starpu_worker_get_id();
 
-	return out_transfer_streams[devid];
+	return out_transfer_streams[worker];
 }
 
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
@@ -195,11 +196,11 @@ done:
 }
 
 #ifndef STARPU_SIMGRID
-static void init_context(unsigned devid)
+static void init_context(struct _starpu_worker_set *worker_set, unsigned devid)
 {
 	cudaError_t cures;
 	int workerid;
-	int i;
+	unsigned i;
 
 	/* TODO: cudaSetDeviceFlag(cudaDeviceMapHost) */
 
@@ -250,19 +251,26 @@ static void init_context(unsigned devid)
 	}
 #endif
 
-	workerid = starpu_worker_get_id();
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		workerid = worker_set->workers[i].workerid;
 
-	cures = cudaStreamCreate(&streams[workerid]);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaEventCreateWithFlags(&task_events[workerid], cudaEventDisableTiming);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaStreamCreate(&in_transfer_streams[devid]);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaStreamCreate(&streams[workerid]);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaStreamCreate(&out_transfer_streams[devid]);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+		cures = cudaStreamCreate(&in_transfer_streams[workerid]);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+
+		cures = cudaStreamCreate(&out_transfer_streams[workerid]);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
 
 	for (i = 0; i < ncudagpus; i++)
 	{
@@ -272,15 +280,23 @@ static void init_context(unsigned devid)
 	}
 }
 
-static void deinit_context(int workerid)
+static void deinit_context(struct _starpu_worker_set *worker_set)
 {
 	cudaError_t cures;
-	int devid = starpu_worker_get_devid(workerid);
-	int i;
+	unsigned i;
+	int workerid, devid;
+
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		workerid = worker_set->workers[i].workerid;
+		devid = starpu_worker_get_devid(workerid);
+
+		cudaEventDestroy(task_events[workerid]);
+		cudaStreamDestroy(streams[workerid]);
+		cudaStreamDestroy(in_transfer_streams[workerid]);
+		cudaStreamDestroy(out_transfer_streams[workerid]);
+	}
 
-	cudaStreamDestroy(streams[workerid]);
-	cudaStreamDestroy(in_transfer_streams[devid]);
-	cudaStreamDestroy(out_transfer_streams[devid]);
 	for (i = 0; i < ncudagpus; i++)
 		cudaStreamDestroy(peer_transfer_streams[i][devid]);
 
@@ -327,21 +343,21 @@ void _starpu_init_cuda(void)
 	STARPU_ASSERT(ncudagpus <= STARPU_MAXCUDADEVS);
 }
 
-static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
+static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 {
 	int ret;
 
 	STARPU_ASSERT(j);
 	struct starpu_task *task = j->task;
 
-	struct timespec codelet_start, codelet_end;
-
 	int profiling = starpu_profiling_status_get();
 
 	STARPU_ASSERT(task);
 	struct starpu_codelet *cl = task->cl;
 	STARPU_ASSERT(cl);
 
+	_starpu_set_current_task(task);
+
 	ret = _starpu_fetch_task_input(j);
 	if (ret != 0)
 	{
@@ -351,7 +367,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
 
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
@@ -367,29 +383,50 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 		_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
-		if (cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 	}
 
+	return 0;
+}
+
+static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
+{
+	struct timespec codelet_end;
+
+	int profiling = starpu_profiling_status_get();
+
+	_starpu_set_current_task(NULL);
+	args->current_task = NULL;
+
 	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
 
-	_starpu_driver_update_job_feedback(j, args, &args->perf_arch, &codelet_start, &codelet_end, profiling);
+	_starpu_driver_update_job_feedback(j, args, &args->perf_arch, &j->cl_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j);
 
-	return 0;
+	_starpu_handle_job_termination(j);
 }
 
 /* XXX Should this be merged with _starpu_init_cuda ? */
-int _starpu_cuda_driver_init(struct _starpu_worker *args)
+int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 {
+	struct _starpu_worker *args = &worker_set->workers[0];
 	unsigned devid = args->devid;
+	unsigned i;
 
 	_starpu_worker_start(args, _STARPU_FUT_CUDA_KEY);
 
+#ifdef STARPU_USE_FXT
+	unsigned memnode = args->memory_node;
+	for (i = 1; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CUDA_KEY, worker->workerid, devid, memnode);
+	}
+#endif
+
 #ifndef STARPU_SIMGRID
-	init_context(devid);
+	init_context(worker_set, devid);
 #endif
 
 	_starpu_cuda_limit_gpu_mem_if_needed(devid);
@@ -424,7 +461,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker *args)
 	snprintf(args->short_name, sizeof(args->short_name), "CUDA %u", devid);
 	_STARPU_DEBUG("cuda (%s) dev id %u thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
-	_STARPU_TRACE_WORKER_INIT_END;
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		_STARPU_TRACE_WORKER_INIT_END(worker->workerid);
+	}
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
@@ -432,64 +473,132 @@ int _starpu_cuda_driver_init(struct _starpu_worker *args)
 	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
 
+	/* tell the main thread that this one is ready */
+	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
+	worker_set->set_is_initialized = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
+
 	return 0;
 }
 
-int _starpu_cuda_driver_run_once(struct _starpu_worker *args)
+int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 {
-	unsigned memnode = args->memory_node;
-	int workerid = args->workerid;
-
-	_STARPU_TRACE_START_PROGRESS(memnode);
-	_starpu_datawizard_progress(memnode, 1);
-	_STARPU_TRACE_END_PROGRESS(memnode);
+	struct _starpu_worker *worker0 = &worker_set->workers[0];
+	unsigned memnode = worker0->memory_node;
+	struct starpu_task *tasks[worker_set->nworkers], *task;
+	struct _starpu_job *j;
+	int i, res, idle;
+
+	/* First poll for completed jobs */
+	idle = 0;
+	for (i = 0; i < (int) worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *args = &worker_set->workers[i];
+		int workerid = args->workerid;
 
-	struct starpu_task *task;
-	struct _starpu_job *j = NULL;
+		task = args->current_task;
 
-	task = _starpu_get_worker_task(args, workerid, memnode);
+		if (!task)
+		{
+			idle++;
+			continue;
+		}
 
-	if (!task)
-		return 0;
+		/* On-going asynchronous task, check for its termination first */
+		cudaError_t cures = cudaEventQuery(task_events[workerid]);
 
-	j = _starpu_get_job_associated_to_task(task);
+		if (cures != cudaSuccess)
+		{
+			STARPU_ASSERT(cures == cudaErrorNotReady);
+			idle++;
+		}
+		else
+		{
+			/* Asynchronous task completed! */
+			_starpu_set_local_worker_key(args);
+			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
+		}
+	}
 
-	/* can CUDA do that task ? */
-	if (!_STARPU_CUDA_MAY_PERFORM(j))
+	if (!idle)
 	{
-		/* this is neither a cuda or a cublas task */
-		_starpu_push_task_to_workers(task);
+		/* Nothing ready yet, no better thing to do than waiting */
+		__starpu_datawizard_progress(memnode, 1, 0);
 		return 0;
 	}
 
-	_starpu_set_current_task(task);
-	args->current_task = j->task;
+	/* Something done, make some progress */
+	__starpu_datawizard_progress(memnode, 1, 1);
 
-	int res = execute_job_on_cuda(j, args);
+	/* And pull tasks */
+	res = _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers);
 
-	_starpu_set_current_task(NULL);
-	args->current_task = NULL;
+	if (!res)
+		return 0;
 
-	if (res)
+	for (i = 0; i < (int) worker_set->nworkers; i++)
 	{
-		switch (res)
+		struct _starpu_worker *args = &worker_set->workers[i];
+		int workerid = args->workerid;
+
+		task = tasks[i];
+		if (!task)
+			continue;
+
+		_starpu_set_local_worker_key(args);
+
+		j = _starpu_get_job_associated_to_task(task);
+
+		/* can CUDA do that task ? */
+		if (!_STARPU_CUDA_MAY_PERFORM(j))
 		{
-			case -EAGAIN:
-				_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
-				_starpu_push_task_to_workers(task);
-				STARPU_ABORT();
-			default:
-				STARPU_ABORT();
+			/* this is neither a cuda or a cublas task */
+			_starpu_push_task_to_workers(task);
+			continue;
 		}
-	}
 
-	_starpu_handle_job_termination(j);
+		_STARPU_TRACE_END_PROGRESS(memnode);
+		res = start_job_on_cuda(j, args);
+
+		if (res)
+		{
+			switch (res)
+			{
+				case -EAGAIN:
+					_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
+					_starpu_push_task_to_workers(task);
+					STARPU_ABORT();
+				default:
+					STARPU_ABORT();
+			}
+		}
+
+#ifndef STARPU_SIMGRID
+		if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+		{
+			/* Record event to synchronize with task termination later */
+			cudaEventRecord(task_events[workerid], starpu_cuda_get_local_stream());
+		}
+		else
+#else
+#ifdef STARPU_DEVEL
+#warning No CUDA asynchronous execution with simgrid yet.
+#endif
+#endif
+		/* Synchronous execution */
+		{
+			finish_job_on_cuda(j, args);
+		}
+		_STARPU_TRACE_START_PROGRESS(memnode);
+	}
 
 	return 0;
 }
 
-int _starpu_cuda_driver_deinit(struct _starpu_worker *args)
+int _starpu_cuda_driver_deinit(struct _starpu_worker_set *arg)
 {
+	struct _starpu_worker *args = &arg->workers[0];
 	unsigned memnode = args->memory_node;
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
@@ -503,7 +612,7 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker *args)
 	_starpu_malloc_shutdown(memnode);
 
 #ifndef STARPU_SIMGRID
-	deinit_context(args->workerid);
+	deinit_context(arg);
 #endif
 
 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);
@@ -513,11 +622,13 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker *args)
 
 void *_starpu_cuda_worker(void *arg)
 {
-	struct _starpu_worker* args = arg;
+	struct _starpu_worker_set* args = arg;
 
 	_starpu_cuda_driver_init(args);
+	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
 		_starpu_cuda_driver_run_once(args);
+	_STARPU_TRACE_END_PROGRESS(memnode);
 	_starpu_cuda_driver_deinit(args);
 
 	return NULL;
@@ -636,11 +747,8 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 }
 #endif /* STARPU_USE_CUDA */
 
-int _starpu_run_cuda(struct _starpu_worker *workerarg)
+int _starpu_run_cuda(struct _starpu_worker_set *workerarg)
 {
-	workerarg->set = NULL;
-	workerarg->worker_is_initialized = 0;
-
 	/* Let's go ! */
 	_starpu_cuda_worker(workerarg);
 

+ 7 - 7
src/drivers/cuda/driver_cuda.h

@@ -48,15 +48,15 @@ void *_starpu_cuda_worker(void *);
 #  define _starpu_cuda_discover_devices(config) ((void) config)
 #endif
 #ifdef STARPU_USE_CUDA
-cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned node);
-cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned node);
+cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
+cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
 
-struct _starpu_worker;
-int _starpu_run_cuda(struct _starpu_worker *);
-int _starpu_cuda_driver_init(struct _starpu_worker *);
-int _starpu_cuda_driver_run_once(struct _starpu_worker *);
-int _starpu_cuda_driver_deinit(struct _starpu_worker *);
+struct _starpu_worker_set;
+int _starpu_run_cuda(struct _starpu_worker_set *);
+int _starpu_cuda_driver_init(struct _starpu_worker_set *);
+int _starpu_cuda_driver_run_once(struct _starpu_worker_set *);
+int _starpu_cuda_driver_deinit(struct _starpu_worker_set *);
 #endif
 
 #endif //  __DRIVER_CUDA_H__

+ 1 - 0
src/drivers/driver_common/driver_common.c

@@ -201,6 +201,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 	if(args->parallel_sect)
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&args->parallel_sect_mutex);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 		_starpu_sched_ctx_signal_worker_blocked(args->workerid);
 		STARPU_PTHREAD_COND_WAIT(&args->parallel_sect_cond, &args->parallel_sect_mutex);
 		starpu_sched_ctx_bind_current_thread_to_cpuid(args->bindid);

+ 10 - 1
src/drivers/mic/driver_mic_source.c

@@ -518,6 +518,11 @@ void *_starpu_mic_src_worker(void *arg)
 	/* unsigned memnode = baseworker->memory_node; */
 
 	_starpu_worker_start(baseworker, _STARPU_FUT_MIC_KEY);
+	for (i = 1; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		_STARPU_TRACE_WORKER_INIT_END(workerid);
+	}
 
 	// Current task for a thread managing a worker set has no sense.
 	_starpu_set_current_task(NULL);
@@ -530,7 +535,11 @@ void *_starpu_mic_src_worker(void *arg)
 
 	baseworker->status = STATUS_UNKNOWN;
 
-	_STARPU_TRACE_WORKER_INIT_END;
+	for (i = 0; i < worker_set->nworkers; i++)
+	{
+		struct _starpu_worker *worker = &worker_set->workers[i];
+		_STARPU_TRACE_WORKER_INIT_END(workerid);
+	}
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);

+ 79 - 31
src/drivers/opencl/driver_opencl.c

@@ -50,6 +50,7 @@ static cl_command_queue in_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue out_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue peer_transfer_queues[STARPU_MAXOPENCLDEVS];
 static cl_command_queue alloc_queues[STARPU_MAXOPENCLDEVS];
+static cl_event task_events[STARPU_MAXOPENCLDEVS];
 #endif
 
 void
@@ -165,7 +166,7 @@ cl_int _starpu_opencl_init_context(int devid)
         err = clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
-        props &= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+        props &= ~CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
         in_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
         if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
         out_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
@@ -561,11 +562,13 @@ void _starpu_opencl_init(void)
 #ifndef STARPU_SIMGRID
 static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
 #endif
-static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_worker *args);
+static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *args);
+static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *args);
 
 int _starpu_opencl_driver_init(struct _starpu_worker *args)
 {
 	int devid = args->devid;
+	int workerid = args->workerid;
 
 	_starpu_worker_start(args, _STARPU_FUT_OPENCL_KEY);
 
@@ -596,7 +599,7 @@ int _starpu_opencl_driver_init(struct _starpu_worker *args)
 
 	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
-	_STARPU_TRACE_WORKER_INIT_END;
+	_STARPU_TRACE_WORKER_INIT_END(workerid);
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
@@ -616,13 +619,36 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 	struct starpu_task *task;
 	int res;
 
-	_STARPU_TRACE_START_PROGRESS(memnode);
-	_starpu_datawizard_progress(memnode, 1);
+	task = starpu_task_get_current();
+
+	if (task)
+	{
+		cl_int status;
+		size_t size;
+		int err;
+		/* On-going asynchronous task, check for its termination first */
+
+		err = clGetEventInfo(task_events[args->devid], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
+		STARPU_ASSERT(size == sizeof(cl_int));
+		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+
+		if (status != CL_COMPLETE)
+		{
+			/* Not ready yet, no better thing to do than waiting */
+			__starpu_datawizard_progress(memnode, 1, 0);
+			return 0;
+		}
+
+		/* Asynchronous task completed! */
+		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
+	}
+
+	__starpu_datawizard_progress(memnode, 1, 1);
+
 	_STARPU_TRACE_END_PROGRESS(memnode);
 
 	task = _starpu_get_worker_task(args, workerid, memnode);
 
-
 	if (task == NULL)
 		return 0;
 
@@ -636,13 +662,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		return 0;
 	}
 
-	_starpu_set_current_task(j->task);
-	args->current_task = j->task;
-
-	res = _starpu_opencl_execute_job(j, args);
-
-	_starpu_set_current_task(NULL);
-	args->current_task = NULL;
+	res = _starpu_opencl_start_job(j, args);
 
 	if (res)
 	{
@@ -658,7 +678,32 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		}
 	}
 
-	_starpu_handle_job_termination(j);
+#ifndef STARPU_SIMGRID
+	if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
+	{
+		/* Record event to synchronize with task termination later */
+		int err;
+		cl_command_queue queue;
+		starpu_opencl_get_queue(args->devid, &queue);
+#ifdef CL_VERSION_1_2
+		err = clEnqueueMarkerWithWaitList(queue, 0, NULL, &task_events[args->devid]);
+#else
+		err = clEnqueueMarker(queue, &task_events[args->devid]);
+#endif
+		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+	else
+#else
+#ifdef STARPU_DEVEL
+#warning No CUDA asynchronous execution with simgrid yet.
+#endif
+#endif
+	/* Synchronous execution */
+	{
+		_starpu_opencl_stop_job(j, args);
+	}
+	_STARPU_TRACE_START_PROGRESS(memnode);
+
 	return 0;
 }
 
@@ -692,9 +737,11 @@ void *_starpu_opencl_worker(void *arg)
 	struct _starpu_worker* args = arg;
 
 	_starpu_opencl_driver_init(args);
+	_STARPU_TRACE_START_PROGRESS(memnode);
 	while (_starpu_machine_is_running())
 		_starpu_opencl_driver_run_once(args);
 	_starpu_opencl_driver_deinit(args);
+	_STARPU_TRACE_END_PROGRESS(memnode);
 
 	return NULL;
 }
@@ -746,7 +793,7 @@ cl_device_type _starpu_opencl_get_device_type(int devid)
 }
 #endif /* STARPU_USE_OPENCL */
 
-static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
+static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *args)
 {
 	int ret;
 
@@ -754,12 +801,14 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 	struct starpu_task *task = j->task;
 
 	int profiling = starpu_profiling_status_get();
-	struct timespec codelet_start, codelet_end;
 
 	STARPU_ASSERT(task);
 	struct starpu_codelet *cl = task->cl;
 	STARPU_ASSERT(cl);
 
+	_starpu_set_current_task(j->task);
+	args->current_task = j->task;
+
 	ret = _starpu_fetch_task_input(j);
 	if (ret != 0)
 	{
@@ -769,7 +818,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");
@@ -780,12 +829,6 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		double length = NAN;
 	  #ifdef STARPU_OPENCL_SIMULATOR
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
-		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
-		{
-			cl_command_queue queue;
-			starpu_opencl_get_queue(args->devid, &queue);
-			clFinish(queue);
-		}
 	    #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
 	      #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
 		#define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
@@ -800,23 +843,28 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 		_starpu_simgrid_execute_job(j, &args->perf_arch, length);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
-		if (cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
-		{
-			cl_command_queue queue;
-			starpu_opencl_get_queue(args->devid, &queue);
-			clFinish(queue);
-		}
 #endif
 	}
+	return 0;
+}
+
+static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *args)
+{
+	struct timespec codelet_end;
+	int profiling = starpu_profiling_status_get();
+
+	_starpu_set_current_task(NULL);
+	args->current_task = NULL;
 
 	_starpu_driver_end_job(args, j, &args->perf_arch, &codelet_end, 0, profiling);
 
 	_starpu_driver_update_job_feedback(j, args, &args->perf_arch,
-					   &codelet_start, &codelet_end, profiling);
+					   &j->cl_start, &codelet_end, profiling);
 
 	_starpu_push_task_output(j);
 
-	return EXIT_SUCCESS;
+	_starpu_handle_job_termination(j);
+
 }
 
 #ifdef STARPU_USE_OPENCL

+ 1 - 1
src/drivers/scc/driver_scc_source.c

@@ -303,7 +303,7 @@ void *_starpu_scc_src_worker(void *arg)
 		snprintf(worker->name, sizeof(worker->name), "MIC %d core %u", devid, i);
 	}
 
-	_STARPU_TRACE_WORKER_INIT_END;
+	_STARPU_TRACE_WORKER_INIT_END(workerid);
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);

+ 36 - 1
src/util/starpu_task_insert_utils.c

@@ -86,7 +86,14 @@ size_t _starpu_task_insert_get_arg_size(va_list varg_list)
 		{
 			(void)va_arg(varg_list, void *);
 		}
-
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+		{
+			(void)va_arg(varg_list, _starpu_callback_func_t);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+		{
+			(void)va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_CALLBACK_ARG)
 		{
 			(void)va_arg(varg_list, void *);
@@ -194,6 +201,14 @@ int _starpu_codelet_pack_args(void **arg_buffer, size_t arg_buffer_size, va_list
 		{
 			(void)va_arg(varg_list, void *);
 		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+		{
+			va_arg(varg_list, _starpu_callback_func_t);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+		{
+			(void)va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			(void)va_arg(varg_list, int);
@@ -261,6 +276,11 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 
 	prologue_cl_arg_wrapper->callback_func = NULL;
 
+	struct task_insert_cb_wrapper *prologue_pop_cl_arg_wrapper = (struct task_insert_cb_wrapper *) malloc(sizeof(struct task_insert_cb_wrapper));
+	STARPU_ASSERT(prologue_pop_cl_arg_wrapper);
+
+	prologue_pop_cl_arg_wrapper->callback_func = NULL;
+
 	while((arg_type = va_arg(varg_list, int)) != 0)
 	{
 		if (arg_type & STARPU_R || arg_type & STARPU_W || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
@@ -341,6 +361,17 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 			void *callback_arg = va_arg(varg_list, void *);
 			prologue_cl_arg_wrapper->callback_arg = callback_arg;
 		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+		{
+			void (*callback_func)(void *);
+			callback_func = va_arg(varg_list, _starpu_callback_func_t);
+			prologue_pop_cl_arg_wrapper->callback_func = callback_func;
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+		{
+			void *callback_arg = va_arg(varg_list, void *);
+			prologue_pop_cl_arg_wrapper->callback_arg = callback_arg;
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			/* Followed by a priority level */
@@ -406,4 +437,8 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 	(*task)->prologue_callback_func = starpu_task_insert_callback_wrapper;
 	(*task)->prologue_callback_arg = prologue_cl_arg_wrapper;
 	(*task)->prologue_callback_arg_free = 1;
+
+	(*task)->prologue_callback_pop_func = starpu_task_insert_callback_wrapper;
+	(*task)->prologue_callback_pop_arg = prologue_pop_cl_arg_wrapper;
+	(*task)->prologue_callback_pop_arg_free = 1;
 }

+ 1 - 1
starpu-1.0-mic.pc.in

@@ -28,7 +28,7 @@ includedir=@includedir@
 Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -D_XOPEN_SOURCE=600 @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@
 Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@
 Libs.private: @LDFLAGS@ @LIBS@
 Requires: @HWLOC_REQUIRES@

+ 1 - 1
starpu-1.0.pc.in

@@ -28,7 +28,7 @@ includedir=@includedir@
 Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -D_XOPEN_SOURCE=600 @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@ -DSTARPU_USE_DEPRECATED_ONE_ZERO_API
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@ -DSTARPU_USE_DEPRECATED_ONE_ZERO_API
 Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_OPENCL_LDFLAGS@ @STARPU_CUDA_LDFLAGS@ @STARPU_SC_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
 Requires: @HWLOC_REQUIRES@

+ 1 - 1
starpu-1.1.pc.in

@@ -28,7 +28,7 @@ includedir=@includedir@
 Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -D_XOPEN_SOURCE=600 @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@
 Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_OPENCL_LDFLAGS@ @STARPU_CUDA_LDFLAGS@ @STARPU_SC_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
 Requires: @HWLOC_REQUIRES@

+ 1 - 1
starpu-1.2.pc.in

@@ -28,7 +28,7 @@ includedir=@includedir@
 Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -D_XOPEN_SOURCE=600 @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@
 Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_OPENCL_LDFLAGS@ @STARPU_CUDA_LDFLAGS@ @STARPU_SC_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
 Requires: @HWLOC_REQUIRES@

+ 8 - 0
tests/Makefile.am

@@ -220,6 +220,7 @@ noinst_PROGRAMS =				\
 	microbenchs/local_pingpong		\
 	microbenchs/matrix_as_vector		\
 	overlap/overlap				\
+	overlap/gpu_concurrency			\
 	parallel_tasks/explicit_combined_worker	\
 	parallel_tasks/parallel_kernels		\
 	parallel_tasks/parallel_kernels_spmd	\
@@ -609,6 +610,13 @@ datawizard_interfaces_void_void_interface_SOURCES=\
 	datawizard/interfaces/void/void_interface.c
 
 
+overlap_gpu_concurrency_SOURCES=\
+	overlap/gpu_concurrency.c
+if STARPU_USE_CUDA
+overlap_gpu_concurrency_SOURCES+=\
+	overlap/long_kernel.cu
+endif
+
 perfmodels_regression_based_SOURCES=\
 	perfmodels/regression_based.c
 

+ 3 - 1
tests/datawizard/acquire_release.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,9 +44,11 @@ static struct starpu_codelet increment_cl =
 	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.cpu_funcs_name = {"increment_cpu", NULL},
 	.nbuffers = 1

+ 2 - 0
tests/datawizard/acquire_release2.c

@@ -44,9 +44,11 @@ static struct starpu_codelet increment_cl =
 	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.cpu_funcs_name = {"increment_cpu", NULL},
 	.nbuffers = 1

+ 0 - 2
tests/datawizard/acquire_release_cuda.cu

@@ -26,6 +26,4 @@ extern "C" void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_a
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	_increment_cuda_codelet<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 6
tests/datawizard/acquire_release_opencl.c

@@ -43,14 +43,9 @@ void increment_opencl(void *buffers[], void *args)
 		size_t global=1;
 		size_t local=1;
 
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
 	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
-
 	starpu_opencl_release_kernel(kernel);
 }

+ 1 - 3
tests/datawizard/cuda_codelet_unsigned_inc.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -28,6 +28,4 @@ extern "C" void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	_cuda_unsigned_inc<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 3
tests/datawizard/data_invalidation.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -45,7 +45,6 @@ static void cuda_memset_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_ar
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(buf, 42, length, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -74,7 +73,6 @@ static void opencl_memset_codelet(void *buffers[], void *args)
 			     0,      /* num_events_in_wait_list */
 			     NULL,   /* event_wait_list */
 			     NULL    /* event */);
-	clFinish(queue);
 }
 #endif /* !STARPU_USE_OPENCL */
 
@@ -93,9 +91,11 @@ static struct starpu_codelet memset_cl =
 	.cpu_funcs = {cpu_memset_codelet, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_memset_codelet, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_memset_codelet, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.cpu_funcs_name = {"cpu_memset_codelet", NULL},
 	.nbuffers = 1,

+ 2 - 2
tests/datawizard/handle_to_pointer.c

@@ -50,7 +50,6 @@ static void cuda_task(void **buffers, void *args)
 	{
 		cudaMemcpyAsync(&numbers[i], &i, sizeof(int), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	}
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -78,7 +77,6 @@ static void opencl_task(void *buffers[], void *args)
 				NULL,           /* event_wait_list */
 				NULL            /* event */);
 	}
-	clFinish(queue);
 }
 #endif
 
@@ -87,9 +85,11 @@ static struct starpu_codelet cl =
 	.cpu_funcs = {cpu_task, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_task, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_task, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.cpu_funcs_name = {"cpu_task", NULL},
 	.nbuffers = 1,

+ 7 - 7
tests/datawizard/increment_redux.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,7 +44,6 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	host_dst += host_src;
 
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -56,7 +55,6 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -80,7 +78,6 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 	h_dst += h_src;
 
 	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
-	clFinish(queue);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
@@ -94,7 +91,6 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 	starpu_opencl_get_current_queue(&queue);
 
 	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
-	clFinish(queue);
 }
 #endif
 
@@ -119,9 +115,11 @@ static struct starpu_codelet redux_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_kernel, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.cpu_funcs = {redux_cpu_kernel, NULL},
 	.cpu_funcs_name = {"redux_cpu_kernel", NULL},
@@ -133,9 +131,11 @@ static struct starpu_codelet neutral_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {neutral_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {neutral_opencl_kernel, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.cpu_funcs = {neutral_cpu_kernel, NULL},
 	.cpu_funcs_name = {"neutral_cpu_kernel", NULL},
@@ -162,7 +162,6 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg STARPU_ATTRIBUTE
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
 	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
-	clFinish(queue);
 }
 #endif
 
@@ -182,7 +181,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -198,9 +196,11 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.cpu_funcs = {increment_cpu_kernel, NULL},
 	.cpu_funcs_name = {"increment_cpu_kernel", NULL},

+ 7 - 7
tests/datawizard/increment_redux_lazy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -40,7 +40,6 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	host_dst += host_src;
 
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -50,7 +49,6 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
 	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -72,7 +70,6 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 	h_dst += h_src;
 
 	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
-	clFinish(queue);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
@@ -84,7 +81,6 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 	starpu_opencl_get_current_queue(&queue);
 
 	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
-	clFinish(queue);
 }
 #endif
 
@@ -105,9 +101,11 @@ static struct starpu_codelet redux_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {redux_opencl_kernel, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.cpu_funcs = {redux_cpu_kernel, NULL},
 	.cpu_funcs_name = {"redux_cpu_kernel", NULL},
@@ -119,9 +117,11 @@ static struct starpu_codelet neutral_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {neutral_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {neutral_opencl_kernel, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.cpu_funcs = {neutral_cpu_kernel, NULL},
 	.cpu_funcs_name = {"neutral_cpu_kernel", NULL},
@@ -146,7 +146,6 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg STARPU_ATTRIBUTE
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
 	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
-	clFinish(queue);
 }
 #endif
 
@@ -164,7 +163,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	host_token++;
 
 	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -178,9 +176,11 @@ static struct starpu_codelet increment_cl =
 {
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda_kernel, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {increment_opencl_kernel, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.cpu_funcs = {increment_cpu_kernel, NULL},
 	.cpu_funcs_name = {"increment_cpu_kernel", NULL},

+ 0 - 0
tests/datawizard/increment_redux_v2.c


Некоторые файлы не были показаны из-за большого количества измененных файлов