Browse Source

merge trunk

Nathalie Furmento 12 years ago
parent
commit
483f8e2979
100 changed files with 1730 additions and 906 deletions
  1. 12 2
      ChangeLog
  2. 11 3
      configure.ac
  3. 20 20
      doc/doxygen/chapters/advanced_examples.doxy
  4. 17 5
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  5. 21 10
      doc/doxygen/chapters/api/insert_task.doxy
  6. 8 4
      doc/doxygen/chapters/api/mpi.doxy
  7. 7 0
      doc/doxygen/chapters/configure_options.doxy
  8. 6 6
      doc/doxygen/chapters/mpi_support.doxy
  9. 5 5
      doc/doxygen/chapters/scheduling_context_hypervisor.doxy
  10. 1 1
      doc/doxygen/chapters/scheduling_contexts.doxy
  11. 4 4
      examples/basic_examples/dynamic_handles.c
  12. 1 1
      examples/binary/binary.c
  13. 3 1
      examples/callback/prologue.c
  14. 12 12
      examples/cg/cg_kernels.c
  15. 7 7
      examples/cholesky/cholesky_implicit.c
  16. 1 1
      examples/cpp/incrementer_cpp.cpp
  17. 14 16
      examples/interface/complex.c
  18. 3 1
      examples/interface/complex_codelet.h
  19. 3 3
      examples/mandelbrot/mandelbrot.c
  20. 8 8
      examples/pipeline/pipeline.c
  21. 6 6
      gcc-plugin/src/tasks.c
  22. 4 4
      gcc-plugin/tests/base.c
  23. 2 2
      gcc-plugin/tests/lib-user.c
  24. 10 10
      gcc-plugin/tests/mocks.h
  25. 3 3
      gcc-plugin/tests/opencl.c
  26. 2 2
      gcc-plugin/tests/output-pointer.c
  27. 4 4
      gcc-plugin/tests/pointers.c
  28. 4 0
      include/starpu_task.h
  29. 3 1
      include/starpu_task_util.h
  30. 0 8
      include/starpu_util.h
  31. 7 5
      mpi/examples/complex/mpi_complex.c
  32. 2 2
      mpi/examples/matrix_decomposition/mpi_cholesky.c
  33. 18 18
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  34. 4 2
      mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c
  35. 3 1
      mpi/examples/matrix_decomposition/mpi_cholesky_models.h
  36. 42 42
      mpi/examples/mpi_lu/pxlu_implicit.c
  37. 11 9
      mpi/examples/stencil/stencil5.c
  38. 2 1
      mpi/include/starpu_mpi.h
  39. 2 2
      mpi/src/Makefile.am
  40. 267 121
      mpi/src/starpu_mpi.c
  41. 1 1
      mpi/src/starpu_mpi_private.h
  42. 111 33
      mpi/src/starpu_mpi_insert_task.c
  43. 0 0
      mpi/src/starpu_mpi_task_insert.h
  44. 10 2
      mpi/tests/Makefile.am
  45. 76 0
      mpi/tests/gather.c
  46. 98 0
      mpi/tests/gather2.c
  47. 9 9
      mpi/tests/insert_task.c
  48. 3 3
      mpi/tests/insert_task_block.c
  49. 5 5
      mpi/tests/insert_task_cache.c
  50. 10 10
      mpi/tests/insert_task_owner.c
  51. 2 2
      mpi/tests/insert_task_owner2.c
  52. 2 2
      mpi/tests/insert_task_owner_data.c
  53. 3 3
      mpi/tests/mpi_earlyrecv.c
  54. 1 1
      mpi/tests/mpi_earlyrecv2.c
  55. 8 6
      mpi/tests/mpi_reduction.c
  56. 7 10
      mpi/tests/mpi_reduction_kernels.c
  57. 2 2
      mpi/tests/mpi_redux.c
  58. 7 6
      mpi/tests/mpi_scatter_gather.c
  59. 6 5
      mpi/tests/user_defined_datatype.c
  60. 14 14
      sc_hypervisor/examples/cholesky/cholesky_implicit.c
  61. 2 0
      sc_hypervisor/include/sc_hypervisor.h
  62. 1 1
      sc_hypervisor/include/sc_hypervisor_lp.h
  63. 1 0
      sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
  64. 72 9
      sc_hypervisor/src/policies_utils/lp_programs.c
  65. 5 2
      sc_hypervisor/src/policies_utils/lp_tools.c
  66. 1 0
      sc_hypervisor/src/policies_utils/policy_tools.c
  67. 1 1
      sc_hypervisor/src/sc_config.c
  68. 149 2
      sc_hypervisor/src/sc_hypervisor.c
  69. 3 3
      src/Makefile.am
  70. 157 91
      src/common/fxt.h
  71. 8 34
      src/common/starpu_spinlock.h
  72. 36 70
      src/common/thread.c
  73. 22 29
      src/common/utils.h
  74. 8 1
      src/core/dependencies/implicit_data_deps.c
  75. 1 1
      src/core/dependencies/task_deps.c
  76. 5 4
      src/core/disk_ops/disk_stdio.c
  77. 2 1
      src/core/perfmodel/perfmodel.c
  78. 1 0
      src/core/perfmodel/perfmodel_bus.c
  79. 27 16
      src/core/perfmodel/perfmodel_history.c
  80. 35 11
      src/core/sched_policy.c
  81. 8 0
      src/core/task.c
  82. 69 47
      src/core/workers.c
  83. 2 1
      src/core/workers.h
  84. 1 0
      src/datawizard/coherency.h
  85. 52 16
      src/datawizard/data_request.c
  86. 23 1
      src/datawizard/filters.c
  87. 14 9
      src/datawizard/interfaces/data_interface.c
  88. 1 5
      src/datawizard/user_interactions.c
  89. 1 1
      src/drivers/cpu/driver_cpu.c
  90. 1 1
      src/drivers/cuda/driver_cuda.c
  91. 0 27
      src/drivers/driver_common/driver_common.c
  92. 1 1
      src/drivers/mic/driver_mic_source.c
  93. 1 1
      src/drivers/opencl/driver_opencl.c
  94. 1 1
      src/drivers/scc/driver_scc_source.c
  95. 14 10
      src/sched_policies/eager_central_policy.c
  96. 8 10
      src/sched_policies/eager_central_priority_policy.c
  97. 20 23
      src/sched_policies/parallel_heft.c
  98. 8 4
      src/sched_policies/work_stealing_policy.c
  99. 33 5
      src/util/starpu_insert_task.c
  100. 0 0
      src/util/starpu_insert_task_utils.c

+ 12 - 2
ChangeLog

@@ -52,8 +52,6 @@ New features:
     scheduled.
 
 Small features:
-  * Add cl_arg_free field to enable automatic free(cl_arg) on task
-    destroy.
   * New functions starpu_data_acquire_cb_sequential_consistency() and
     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
     to enable or disable sequential consistency
@@ -64,6 +62,13 @@ Small features:
     the tool starpu_perfmodel_display
   * New batch files to execute StarPU applications under Microsoft
     Visual Studio (They are installed in path_to_starpu/bin/mvsc)/
+  * Add cl_arg_free, callback_arg_free, prologue_callback_arg_free fields to
+    enable automatic free(cl_arg); free(callback_arg);
+    free(prologue_callback_arg) on task destroy.
+  * New function starpu_task_build
+  * Functions starpu_insert_task and starpu_mpi_insert_task are
+    renamed in starpu_task_insert and starpu_mpi_task_insert. Old
+    names are kept to avoid breaking old codes.
 
 Changes:
   * Fix of the livelock issue discovered while executing applications
@@ -72,6 +77,11 @@ Changes:
   * Data interfaces (variable, vector, matrix and block) now define
     pack und unpack functions
   * Fix for properly dealing with NAN on windows systems
+  * StarPU-MPI: Fix for being able to receive data which have not yet
+    been registered by the application (i.e it did not call
+    starpu_data_set_tag(), data are received as a raw memory)
+  * StarPU-MPI: Fix for being able to receive data with the same tag
+    from several nodes (see mpi/tests/gather.c)
 
 StarPU 1.1.0 (svn revision xxxx)
 ==============================================

+ 11 - 3
configure.ac

@@ -54,8 +54,8 @@ AC_CANONICAL_SYSTEM
 dnl Automake 1.11 introduced `silent-rules' and `color-tests'.  Use them
 dnl when they're available.
 m4_ifdef([AM_SILENT_RULES],
-  [AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign silent-rules color-tests parallel-tests])],
-  [AM_INIT_AUTOMAKE([1.10 -Wall -Werror foreign])])
+  [AM_INIT_AUTOMAKE([1.11 -Wall foreign silent-rules color-tests parallel-tests])],
+  [AM_INIT_AUTOMAKE([1.10 -Wall foreign])])
 
 m4_ifdef([AM_SILENT_RULES],
   [AM_SILENT_RULES(yes)])
@@ -258,6 +258,8 @@ AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to
 AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
 AC_CHECK_HEADERS([valgrind/helgrind.h], [AC_DEFINE([STARPU_HAVE_HELGRIND_H], [1], [Define to 1 if you have the <valgrind/helgrind.h> header file.])])
 
+AC_CHECK_FUNC([sched_yield], [AC_DEFINE([STARPU_HAVE_SCHED_YIELD], [1], [Define to 1 if the function sched_yield is available.])])
+
 AC_CHECK_HEADERS([aio.h])
 
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
@@ -1291,14 +1293,20 @@ AC_ARG_ENABLE(debug, [AS_HELP_STRING([--enable-debug], [enable debug mode])],
 			enable_debug=$enableval, enable_debug=no)
 AC_MSG_RESULT($enable_debug)
 
+AC_ARG_ENABLE(spinlock_check, [AS_HELP_STRING([--enable-spinlock-check], [enable spinlock check])], enable_spinlock_check=$enableval, enable_spinlock_check=no)
+
 if test x$enable_debug = xyes; then
 	CFLAGS="$CFLAGS -O0"
-	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
+	enable_spinlock_check=yes
 else
 	CFLAGS="-O3 $CFLAGS"
 fi
 CFLAGS+=" -gdwarf-2 -g3 "
 
+if test x$enable_spinlock_check = xyes; then
+	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
+fi
+
 AC_MSG_CHECKING(whether extra checks should be performed)
 AC_ARG_ENABLE(fast, [AS_HELP_STRING([--enable-fast],
 			[do not enforce assertions])],

+ 20 - 20
doc/doxygen/chapters/advanced_examples.doxy

@@ -481,7 +481,7 @@ to a less optimal solution. This increases even more computation time.
 
 \section InsertTaskUtility Insert Task Utility
 
-StarPU provides the wrapper function starpu_insert_task() to ease
+StarPU provides the wrapper function starpu_task_insert() to ease
 the creation and submission of tasks.
 
 Here the implementation of the codelet:
@@ -508,17 +508,17 @@ struct starpu_codelet mycodelet = {
 };
 \endcode
 
-And the call to the function starpu_insert_task():
+And the call to the function starpu_task_insert():
 
 \code{.c}
-starpu_insert_task(&mycodelet,
+starpu_task_insert(&mycodelet,
                    STARPU_VALUE, &ifactor, sizeof(ifactor),
                    STARPU_VALUE, &ffactor, sizeof(ffactor),
                    STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
                    0);
 \endcode
 
-The call to starpu_insert_task() is equivalent to the following
+The call to starpu_task_insert() is equivalent to the following
 code:
 
 \code{.c}
@@ -540,7 +540,7 @@ int ret = starpu_task_submit(task);
 Here a similar call using ::STARPU_DATA_ARRAY.
 
 \code{.c}
-starpu_insert_task(&mycodelet,
+starpu_task_insert(&mycodelet,
                    STARPU_DATA_ARRAY, data_handles, 2,
                    STARPU_VALUE, &ifactor, sizeof(ifactor),
                    STARPU_VALUE, &ffactor, sizeof(ffactor),
@@ -554,11 +554,11 @@ instance, assuming that the index variable <c>i</c> was registered as handle
 
 \code{.c}
 /* Compute which portion we will work on, e.g. pivot */
-starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
+starpu_task_insert(&which_index, STARPU_W, i_handle, 0);
 
 /* And submit the corresponding task */
 STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
-                       starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
+                       starpu_task_insert(&work, STARPU_RW, A_handle[i], 0));
 \endcode
 
 The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
@@ -637,7 +637,7 @@ dot products with partitioned vectors:
 
 \code{.c}
 for (b = 0; b < nblocks; b++)
-    starpu_insert_task(&dot_kernel_cl,
+    starpu_task_insert(&dot_kernel_cl,
         STARPU_REDUX, dtq_handle,
         STARPU_R, starpu_data_get_sub_data(v1, 1, b),
         STARPU_R, starpu_data_get_sub_data(v2, 1, b),
@@ -659,9 +659,9 @@ the initial status <c>register(NULL)</c>.
 The example <c>cg</c> also uses reduction for the blocked gemv kernel,
 leading to yet more relaxed dependencies and more parallelism.
 
-::STARPU_REDUX can also be passed to starpu_mpi_insert_task() in the MPI
+::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
 case. That will however not produce any MPI communication, but just pass
-::STARPU_REDUX to the underlying starpu_insert_task(). It is up to the
+::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
 application to call starpu_mpi_redux_data(), which posts tasks that will
 reduce the partial results among MPI nodes into the MPI node which owns the
 data. For instance, some hypothetical application which collects partial results
@@ -670,11 +670,11 @@ with a new reduction:
 
 \code{.c}
 for (i = 0; i < 100; i++) {
-    starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
-    starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A,
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
                STARPU_R, B, STARPU_REDUX, res, 0);
     starpu_mpi_redux_data(MPI_COMM_WORLD, res);
-    starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
 }
 \endcode
 
@@ -705,9 +705,9 @@ unregistration.
 
 \code{.c}
 starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
-starpu_insert_task(&produce_data, STARPU_W, handle, 0);
-starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
-starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
+starpu_task_insert(&produce_data, STARPU_W, handle, 0);
+starpu_task_insert(&compute_data, STARPU_RW, handle, 0);
+starpu_task_insert(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
 starpu_data_unregister_submit(handle);
 \endcode
 
@@ -725,7 +725,7 @@ provides per-worker buffers without content consistency.
 \code{.c}
 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
 for (i = 0; i < N; i++)
-    starpu_insert_task(&compute, STARPU_R, input[i],
+    starpu_task_insert(&compute, STARPU_R, input[i],
                        STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
 \endcode
 
@@ -1028,7 +1028,7 @@ starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
                             output, num_bytes / sizeof(float4), sizeof(float4));
 
 /* The handle can now be used as usual */
-starpu_insert_task(&cl, STARPU_RW, handle, 0);
+starpu_task_insert(&cl, STARPU_RW, handle, 0);
 
 /* ... */
 
@@ -1122,7 +1122,7 @@ Complex data interfaces can then be registered to StarPU.
 \code{.c}
 double real = 45.0;
 double imaginary = 12.0;starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
-starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
+starpu_task_insert(&cl_display, STARPU_R, handle1, 0);
 \endcode
 
 and used by codelets.
@@ -1186,7 +1186,7 @@ for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
 {
 	handles[i] = handle;
 }
-starpu_insert_task(&dummy_big_cl,
+starpu_task_insert(&dummy_big_cl,
         	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
 		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
 		 0);

+ 17 - 5
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -406,20 +406,32 @@ Optional field, the default value is <c>NULL</c>. This is the pointer
 passed to the callback function. This field is ignored if the field
 starpu_task::callback_func is set to <c>NULL</c>.
 
-\var starpu_task::prologue_func
+\var starpu_task::callback_arg_free
+Optional field. In case starpu_task::callback_arg was allocated by the
+application through <c>malloc()</c>, setting starpu_task::callback_arg_free
+to 1 makes StarPU automatically call <c>free(callback_arg)</c> when
+destroying the task.
+
+\var starpu_task::prologue_callback_func
 Optional field, the default value is <c>NULL</c>. This is a function
 pointer of prototype <c>void (*f)(void *)</c> which specifies a
 possible callback. 
 If this pointer is non-null, the callback function
 is executed on the host when the task becomes ready for execution,
 before getting scheduled. The callback is passed the
-value contained in the starpu_task::prologue_arg field. No callback is
+value contained in the starpu_task::prologue_callback_arg field. No callback is
 executed if the field is set to NULL.
 
-\var starpu_task::prologue_arg (optional) (default: NULL)
+\var starpu_task::prologue_callback_arg (optional) (default: NULL)
 Optional field, the default value is <c>NULL</c>. This is the pointer
-passed to the prologue function. This field is ignored if the field
-starpu_task::prologue_func is set to <c>NULL</c>.
+passed to the prologue callback function. This field is ignored if the field
+starpu_task::prologue_callback_func is set to <c>NULL</c>.
+
+\var starpu_task::prologue_callback_arg_free
+Optional field. In case starpu_task::prologue_callback_arg was allocated by the
+application through <c>malloc()</c>, setting starpu_task::prologue_callback_arg_free
+to 1 makes StarPU automatically call <c>free(prologue_callback_arg)</c> when
+destroying the task.
 
 \var starpu_task::use_tag
 Optional field, the default value is 0. If set, this flag indicates

+ 21 - 10
doc/doxygen/chapters/api/insert_task.doxy

@@ -8,7 +8,11 @@
 
 /*! \defgroup API_Insert_Task Insert_Task
 
-\fn int starpu_insert_task(struct starpu_codelet *cl, ...)
+\def starpu_insert_task
+\ingroup API_Insert_Task
+Convenience macro for the function starpu_task_insert() which used to be called starpu_insert_task.
+
+\fn int starpu_task_insert(struct starpu_codelet *cl, ...)
 \ingroup API_Insert_Task
 Create and submit a task corresponding to \p cl with the
 following arguments. The argument list must be zero-terminated.
@@ -35,18 +39,18 @@ implementation to retrieve them.
 
 \def STARPU_VALUE
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by a pointer to a constant value and the size of the
 constant
 
 \def STARPU_CALLBACK
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by a pointer to a callback function
 
 \def STARPU_CALLBACK_WITH_ARG
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by two pointers: one to a callback function, and the other
 to be given as an argument to the callback function; this is
 equivalent to using both ::STARPU_CALLBACK and
@@ -54,13 +58,13 @@ equivalent to using both ::STARPU_CALLBACK and
 
 \def STARPU_CALLBACK_ARG
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by a pointer to be given as an argument to the callback
 function
 
 \def STARPU_PRIORITY
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by a integer defining a priority level
 
 \def STARPU_DATA_ARRAY
@@ -69,18 +73,18 @@ TODO
 
 \def STARPU_TAG
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must be followed by a tag.
+this macro is used when calling starpu_task_insert(), and must be followed by a tag.
 
 \def STARPU_FLOPS
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by an amount of floating point operations, as a double.
 Users <b>MUST</b> explicitly cast into double, otherwise parameter
 passing will not work.
 
 \def STARPU_SCHED_CTX
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by the id of the scheduling context to which we want to
 submit the task.
 
@@ -93,6 +97,13 @@ starpu_codelet_unpack_args().
 \fn void starpu_codelet_unpack_args(void *cl_arg, ...)
 \ingroup API_Insert_Task
 Retrieve the arguments of type ::STARPU_VALUE associated to a
-task automatically created using the function starpu_insert_task().
+task automatically created using the function starpu_task_insert().
+
+\fn struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...)
+\ingroup API_Insert_Task
+Create a task corresponding to \p cl with the following arguments.
+The argument list must be zero-terminated. The arguments
+following the codelet are the same as the ones for the function
+starpu_task_insert().
 
 */

+ 8 - 4
doc/doxygen/chapters/api/mpi.doxy

@@ -204,23 +204,27 @@ Returns the last value set by starpu_data_set_rank().
 
 \def STARPU_EXECUTE_ON_NODE
 \ingroup API_MPI_Support
-this macro is used when calling starpu_mpi_insert_task(), and must be
+this macro is used when calling starpu_mpi_task_insert(), and must be
 followed by a integer value which specified the node on which to
 execute the codelet.
 
 \def STARPU_EXECUTE_ON_DATA
 \ingroup API_MPI_Support
-this macro is used when calling starpu_mpi_insert_task(), and must be
+this macro is used when calling starpu_mpi_task_insert(), and must be
 followed by a data handle to specify that the node owning the given
 data will execute the codelet.
 
-\fn int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+\def starpu_mpi_insert_task
+\ingroup API_MPI_Support
+Convenience macro for the function starpu_mpi_task_insert() which used to be called starpu_mpi_insert_task.
+
+\fn int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
 Create and submit a task corresponding to codelet with the following
 arguments. The argument list must be zero-terminated.
 
 The arguments following the codelets are the same types as for the
-function starpu_insert_task(). The extra argument
+function starpu_task_insert(). The extra argument
 ::STARPU_EXECUTE_ON_NODE followed by an integer allows to specify the
 MPI node to execute the codelet. It is also possible to specify that
 the node owning a specific data will execute the codelet, by using

+ 7 - 0
doc/doxygen/chapters/configure_options.doxy

@@ -22,6 +22,13 @@ the following configure options.
 Enable debugging messages.
 </dd>
 
+<dt>--enable-spinlock-check</dt>
+<dd>
+\anchor enable-spinlock-check
+\addindex __configure__--enable-spinlock-check
+Enable checking that spinlocks are taken and released properly.
+</dd>
+
 <dt>--enable-fast</dt>
 <dd>
 \anchor enable-fast

+ 6 - 6
doc/doxygen/chapters/mpi_support.doxy

@@ -232,7 +232,7 @@ task, and trigger the required MPI transfers.
 
 The list of functions is described in \ref MPIInsertTask "MPI Insert Task".
 
-Here an stencil example showing how to use starpu_mpi_insert_task(). One
+Here an stencil example showing how to use starpu_mpi_task_insert(). One
 first needs to define a distribution function which specifies the
 locality of the data. Note that that distribution information needs to
 be given to StarPU by calling starpu_data_set_rank(). A MPI tag
@@ -291,14 +291,14 @@ data which will be needed by the tasks that we will execute.
     }
 \endcode
 
-Now starpu_mpi_insert_task() can be called for the different
+Now starpu_mpi_task_insert() can be called for the different
 steps of the application.
 
 \code{.c}
     for(loop=0 ; loop<niter; loop++)
         for (x = 1; x < X-1; x++)
             for (y = 1; y < Y-1; y++)
-                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl,
+                starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl,
                                        STARPU_RW, data_handles[x][y],
                                        STARPU_R, data_handles[x-1][y],
                                        STARPU_R, data_handles[x+1][y],
@@ -365,7 +365,7 @@ for(x = 0; x < nblocks ;  x++) {
     if (data_handles[x]) {
         int owner = starpu_data_get_rank(data_handles[x]);
         if (owner == rank) {
-            starpu_insert_task(&cl, STARPU_RW, data_handles[x], 0);
+            starpu_task_insert(&cl, STARPU_RW, data_handles[x], 0);
         }
     }
 }
@@ -383,9 +383,9 @@ MPI examples are available in the StarPU source code in mpi/examples:
 <ul>
 <li><c>complex</c> is a simple example using a user-define data interface over
 MPI (complex numbers),
-<li><c>stencil5</c> is a simple stencil example using <c>starpu_mpi_insert_task</c>,
+<li><c>stencil5</c> is a simple stencil example using starpu_mpi_task_insert(),
 <li><c>matrix_decomposition</c> is a cholesky decomposition example using
-<c>starpu_mpi_insert_task</c>. The non-distributed version can check for
+starpu_mpi_task_insert(). The non-distributed version can check for
 <algorithm correctness in 1-node configuration, the distributed version uses
 exactly the same source code, to be used over MPI,
 <li><c>mpi_lu</c> is an LU decomposition example, provided in three versions:

+ 5 - 5
doc/doxygen/chapters/scheduling_context_hypervisor.doxy

@@ -76,7 +76,7 @@ The <b>Application driven</b> strategy uses the user's input concerning the mome
 Thus, the users tags the task that should trigger the resizing
 process. We can set directly the field starpu_task::hypervisor_tag or
 use the macro ::STARPU_HYPERVISOR_TAG in the function
-starpu_insert_task().
+starpu_task_insert().
 
 \code{.c}
 task.hypervisor_tag = 2;
@@ -85,7 +85,7 @@ task.hypervisor_tag = 2;
 or
 
 \code{.c}
-starpu_insert_task(&codelet,
+starpu_task_insert(&codelet,
 		    ...,
 		    STARPU_HYPERVISOR_TAG, 2,
                     0);
@@ -131,7 +131,7 @@ The number of flops to be executed by a context are passed as
  (<c>sc_hypervisor_register_ctx(sched_ctx_id, flops)</c>) and the one
  to be executed by each task are passed when the task is submitted.
  The corresponding field is starpu_task::flops and the corresponding
- macro in the function starpu_insert_task() is ::STARPU_FLOPS
+ macro in the function starpu_task_insert() is ::STARPU_FLOPS
  (<b>Caution</b>: but take care of passing a double, not an integer,
  otherwise parameter passing will be bogus). When the task is executed
  the resizing process is triggered.
@@ -143,7 +143,7 @@ task.flops = 100;
 or
 
 \code{.c}
-starpu_insert_task(&codelet,
+starpu_task_insert(&codelet,
                     ...,
                     STARPU_FLOPS, (double) 100,
                     0);
@@ -215,4 +215,4 @@ struct sc_hypervisor_policy dummy_policy =
 \endcode
 
 
-*/
+*/

+ 1 - 1
doc/doxygen/chapters/scheduling_contexts.doxy

@@ -97,7 +97,7 @@ the current thread will submit tasks to the coresponding context.
 When the application may not assign a thread of submission to each
 context, the id of the context must be indicated by using the
 function <c>starpu_task_submit_to_ctx</c> or the field <c>STARPU_SCHED_CTX</c> 
-for <c>starpu_insert_task</c>.
+for starpu_task_insert().
 
 \section DeletingAContext Deleting A Context
 

+ 4 - 4
examples/basic_examples/dynamic_handles.c

@@ -112,12 +112,12 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
-	ret = starpu_insert_task(&dummy_small_cl,
+	ret = starpu_task_insert(&dummy_small_cl,
 				 STARPU_VALUE, &dummy_small_cl.nbuffers, sizeof(dummy_small_cl.nbuffers),
 				 STARPU_RW, handle,
 				 0);
 	if (ret == -ENODEV) goto enodev;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
         ret = starpu_task_wait_for_all();
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 
@@ -126,12 +126,12 @@ int main(int argc, char **argv)
 	{
 		handles[i] = handle;
 	}
-	ret = starpu_insert_task(&dummy_big_cl,
+	ret = starpu_task_insert(&dummy_big_cl,
 				 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
 				 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
 				 0);
 	if (ret == -ENODEV) goto enodev;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
         ret = starpu_task_wait_for_all();
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 	free(handles);

+ 1 - 1
examples/binary/binary.c

@@ -67,7 +67,7 @@ int compute(char *file_name, int load_as_file)
 
 	for (i = 0; i < niter; i++)
 	{
-		ret = starpu_insert_task(&cl, STARPU_RW, float_array_handle, 0);
+		ret = starpu_task_insert(&cl, STARPU_RW, float_array_handle, 0);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
 			FPRINTF(stderr, "No worker may execute this task\n");

+ 3 - 1
examples/callback/prologue.c

@@ -80,7 +80,7 @@ int main(int argc, char **argv)
 
 	double *x = (double*)malloc(sizeof(double));
 	*x = -999.0;
-	int ret2 = starpu_insert_task(&cl,
+	int ret2 = starpu_task_insert(&cl,
 				      STARPU_RW, handle,
 				      STARPU_PROLOGUE_CALLBACK, prologue_callback_func,
 				      STARPU_PROLOGUE_CALLBACK_ARG, x,
@@ -92,6 +92,8 @@ int main(int argc, char **argv)
 
 	FPRINTF(stderr, "v -> %d\n", v);
 
+	free(x);
+
 	starpu_shutdown();
 
 	return 0;

+ 12 - 12
examples/cg/cg_kernels.c

@@ -288,20 +288,20 @@ int dot_kernel(starpu_data_handle_t v1,
 	if (use_reduction)
 		starpu_data_invalidate_submit(s);
 	else {
-		ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+		ret = starpu_task_insert(&bzero_variable_cl, STARPU_W, s, 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
 
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		ret = starpu_insert_task(&dot_kernel_cl,
+		ret = starpu_task_insert(&dot_kernel_cl,
 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
 					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
 					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
 					 0);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
 	return 0;
 }
@@ -442,12 +442,12 @@ int gemv_kernel(starpu_data_handle_t v1,
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	{
-		ret = starpu_insert_task(&scal_kernel_cl,
+		ret = starpu_task_insert(&scal_kernel_cl,
 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
 
 	for (b2 = 0; b2 < nblocks; b2++)
@@ -455,14 +455,14 @@ int gemv_kernel(starpu_data_handle_t v1,
 		for (b1 = 0; b1 < nblocks; b1++)
 		{
 			TYPE one = 1.0;
-			ret = starpu_insert_task(&gemv_kernel_cl,
+			ret = starpu_task_insert(&gemv_kernel_cl,
 						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
 						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
 						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
 						 STARPU_VALUE,	&one,	sizeof(one),
 						 STARPU_VALUE,	&p2,	sizeof(p2),
 						 0);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}
 	}
 	return 0;
@@ -535,14 +535,14 @@ int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		ret = starpu_insert_task(&scal_axpy_kernel_cl,
+		ret = starpu_task_insert(&scal_axpy_kernel_cl,
 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
 					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p2, sizeof(p2),
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
 	return 0;
 }
@@ -609,13 +609,13 @@ int axpy_kernel(starpu_data_handle_t v1,
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		ret = starpu_insert_task(&axpy_kernel_cl,
+		ret = starpu_task_insert(&axpy_kernel_cl,
 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
 					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
 	return 0;
 }

+ 7 - 7
examples/cholesky/cholesky_implicit.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -99,27 +99,27 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	{
                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 
-                ret = starpu_insert_task(&cl11,
+                ret = starpu_task_insert(&cl11,
 					 STARPU_PRIORITY, prio_level,
 					 STARPU_RW, sdatakk,
 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
 					 STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 					 0);
 		if (ret == -ENODEV) return 77;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 		for (j = k+1; j<nblocks; j++)
 		{
                         starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
 
-                        ret = starpu_insert_task(&cl21,
+                        ret = starpu_task_insert(&cl21,
 						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
 						 STARPU_R, sdatakk,
 						 STARPU_RW, sdatakj,
 						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						 0);
 			if (ret == -ENODEV) return 77;
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 			for (i = k+1; i<nblocks; i++)
 			{
@@ -128,7 +128,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					starpu_data_handle_t sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
 					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
 
-					ret = starpu_insert_task(&cl22,
+					ret = starpu_task_insert(&cl22,
 								 STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
 								 STARPU_R, sdataki,
 								 STARPU_R, sdatakj,
@@ -136,7 +136,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 								 0);
 					if (ret == -ENODEV) return 77;
-					STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
                                 }
 			}
 		}

+ 1 - 1
examples/cpp/incrementer_cpp.cpp

@@ -71,7 +71,7 @@ int main(int argc, char **argv)
 
 	for (i = 0; i < niter; i++)
 	{
-		ret = starpu_insert_task(&cl,
+		ret = starpu_task_insert(&cl,
 					 STARPU_RW, float_array_handle,
 					 0);
                 if (STARPU_UNLIKELY(ret == -ENODEV))

+ 14 - 16
examples/interface/complex.c

@@ -18,8 +18,6 @@
 #include "complex_interface.h"
 #include "complex_codelet.h"
 
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
-
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
        if (starpu_worker_get_type(workerid) == STARPU_OPENCL_WORKER)
@@ -95,21 +93,21 @@ int main(int argc, char **argv)
 	starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
 	starpu_complex_data_register(&handle2, STARPU_MAIN_RAM, &copy_real, &copy_imaginary, 1);
 
-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_insert_task(&cl_compare,
+	ret = starpu_task_insert(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
 				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
 				 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	starpu_task_wait_for_all();
 	if (compare != 0)
 	{
@@ -117,28 +115,28 @@ int main(int argc, char **argv)
 	     goto end;
 	}
 
-	ret = starpu_insert_task(&cl_copy,
+	ret = starpu_task_insert(&cl_copy,
 				 STARPU_R, handle1,
 				 STARPU_W, handle2,
 				 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_insert_task(&cl_compare,
+	ret = starpu_task_insert(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
 				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
 				 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 	starpu_task_wait_for_all();
 

+ 3 - 1
examples/interface/complex_codelet.h

@@ -20,6 +20,8 @@
 #ifndef __COMPLEX_CODELET_H
 #define __COMPLEX_CODELET_H
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 void compare_complex_codelet(void *descr[], void *_args)
 {
 	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
@@ -70,7 +72,7 @@ void display_complex_codelet(void *descr[], void *_args)
 
 	for(i=0 ; i<nx ; i++)
 	{
-		fprintf(stderr, "[%s] Complex[%d] = %3.2f + %3.2f i\n", _args?msg:NULL, i, real[i], imaginary[i]);
+		FPRINTF(stderr, "[%s] Complex[%d] = %3.2f + %3.2f i\n", _args?msg:NULL, i, real[i], imaginary[i]);
 	}
 }
 

+ 3 - 3
examples/mandelbrot/mandelbrot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -529,7 +529,7 @@ int main(int argc, char **argv)
 			per_block_cnt[iby] = 0;
 			int *pcnt = &per_block_cnt[iby];
 
-			ret = starpu_insert_task(use_spmd?&spmd_mandelbrot_cl:&mandelbrot_cl,
+			ret = starpu_task_insert(use_spmd?&spmd_mandelbrot_cl:&mandelbrot_cl,
 						 STARPU_VALUE, &iby, sizeof(iby),
 						 STARPU_VALUE, &block_size, sizeof(block_size),
 						 STARPU_VALUE, &stepX, sizeof(stepX),
@@ -537,7 +537,7 @@ int main(int argc, char **argv)
 						 STARPU_W, block_handles[iby],
 						 STARPU_VALUE, &pcnt, sizeof(int *),
 						 0);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}
 
 		for (iby = 0; iby < nblocks; iby++)

+ 8 - 8
examples/pipeline/pipeline.c

@@ -200,33 +200,33 @@ int main(void)
 			sem_wait(&sems[l%C]);
 
 		/* Now submit the next stage */
-		ret = starpu_insert_task(&pipeline_codelet_x,
+		ret = starpu_task_insert(&pipeline_codelet_x,
 				STARPU_W, buffersX[l%K],
 				STARPU_VALUE, &x, sizeof(x),
 				0);
 		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task x");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert x");
 
-		ret = starpu_insert_task(&pipeline_codelet_x,
+		ret = starpu_task_insert(&pipeline_codelet_x,
 				STARPU_W, buffersY[l%K],
 				STARPU_VALUE, &y, sizeof(y),
 				0);
 		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task y");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert y");
 
-		ret = starpu_insert_task(&pipeline_codelet_axpy,
+		ret = starpu_task_insert(&pipeline_codelet_axpy,
 				STARPU_R, buffersX[l%K],
 				STARPU_RW, buffersY[l%K],
 				0);
 		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task axpy");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert axpy");
 
-		ret = starpu_insert_task(&pipeline_codelet_sum,
+		ret = starpu_task_insert(&pipeline_codelet_sum,
 				STARPU_R, buffersY[l%K],
 				STARPU_CALLBACK_WITH_ARG, (void (*)(void*))sem_post, &sems[l%C],
 				0);
 		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task sum");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert sum");
 	}
 	starpu_task_wait_for_all();
 

+ 6 - 6
gcc-plugin/src/tasks.c

@@ -525,7 +525,7 @@ declare_codelet (tree task_decl)
   return cl_decl;
 }
 
-/* Build the body of TASK_DECL, which will call `starpu_insert_task'.  */
+/* Build the body of TASK_DECL, which will call `starpu_task_insert'.  */
 
 void
 define_task (tree task_decl)
@@ -583,22 +583,22 @@ define_task (tree task_decl)
   /* Introduce a local variable to hold the error code.  */
 
   tree error_var = build_decl (loc, VAR_DECL,
-  			       create_tmp_var_name (".insert_task_error"),
+  			       create_tmp_var_name (".task_insert_error"),
   			       integer_type_node);
   DECL_CONTEXT (error_var) = task_decl;
   DECL_ARTIFICIAL (error_var) = true;
 
   /* Build this:
 
-       err = starpu_insert_task (...);
+       err = starpu_task_insert (...);
        if (err != 0)
          { printf ...; abort (); }
    */
 
-  static tree insert_task_fn;
-  LOOKUP_STARPU_FUNCTION (insert_task_fn, "starpu_insert_task");
+  static tree task_insert_fn;
+  LOOKUP_STARPU_FUNCTION (task_insert_fn, "starpu_task_insert");
 
-  tree call = build_call_expr_loc_vec (loc, insert_task_fn, args);
+  tree call = build_call_expr_loc_vec (loc, task_insert_fn, args);
 
   tree assignment = build2 (INIT_EXPR, TREE_TYPE (error_var),
   			    error_var, call);

+ 4 - 4
gcc-plugin/tests/base.c

@@ -106,7 +106,7 @@ main (int argc, char *argv[])
   unsigned char y = 77;
   long y_as_long_int = 77;
 
-  struct insert_task_argument expected[] =
+  struct task_insert_argument expected[] =
     {
       { STARPU_VALUE, &x, sizeof x },
       { STARPU_VALUE, &y, sizeof y },
@@ -114,7 +114,7 @@ main (int argc, char *argv[])
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected;
+  expected_task_insert_arguments = expected;
 
   /* Invoke the task, which should make sure it gets called with
      EXPECTED.  */
@@ -135,14 +135,14 @@ main (int argc, char *argv[])
 
   assert (tasks_submitted == 9);
 
-  struct insert_task_argument expected2[] =
+  struct task_insert_argument expected2[] =
     {
       { STARPU_VALUE, &x, sizeof x },
       { 0, 0, 0 }
     };
 
   tasks_submitted = 0;
-  expected_insert_task_arguments = expected2;
+  expected_task_insert_arguments = expected2;
 
   my_other_task (42);
   assert (tasks_submitted == 1);

+ 2 - 2
gcc-plugin/tests/lib-user.c

@@ -38,7 +38,7 @@ main (int argc, char *argv[])
   static const char forty_two = 42;
   static const int  sizeof_x = sizeof x;
 
-  struct insert_task_argument expected_pointer_task[] =
+  struct task_insert_argument expected_pointer_task[] =
     {
       { STARPU_VALUE, &forty_two, sizeof forty_two },
       { STARPU_R,  x },
@@ -47,7 +47,7 @@ main (int argc, char *argv[])
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected_pointer_task;
+  expected_task_insert_arguments = expected_pointer_task;
 
   expected_register_arguments.pointer = (void *) x;
   expected_register_arguments.elements = sizeof x / sizeof x[0];

+ 10 - 10
gcc-plugin/tests/mocks.h

@@ -62,7 +62,7 @@ typedef double         cl_double;
 /* Number of tasks submitted.  */
 static unsigned int tasks_submitted;
 
-struct insert_task_argument
+struct task_insert_argument
 {
   /* `STARPU_VALUE', etc. */
   int type;
@@ -75,18 +75,18 @@ struct insert_task_argument
 };
 
 /* Pointer to a zero-terminated array listing the expected
-   `starpu_insert_task' arguments.  */
-const struct insert_task_argument *expected_insert_task_arguments;
+   `starpu_task_insert' arguments.  */
+const struct task_insert_argument *expected_task_insert_arguments;
 
 /* Expected targets of the codelets submitted.  */
-static int expected_insert_task_targets = STARPU_CPU | STARPU_OPENCL;
+static int expected_task_insert_targets = STARPU_CPU | STARPU_OPENCL;
 
 
 int
-starpu_insert_task (struct starpu_codelet *cl, ...)
+starpu_task_insert (struct starpu_codelet *cl, ...)
 {
   assert (cl->name != NULL && strlen (cl->name) > 0);
-  assert (cl->where == expected_insert_task_targets);
+  assert (cl->where == expected_task_insert_targets);
 
   assert ((cl->where & STARPU_CPU) == 0
 	  ? cl->cpu_funcs[0] == NULL
@@ -106,8 +106,8 @@ starpu_insert_task (struct starpu_codelet *cl, ...)
 
   va_start (args, cl);
 
-  const struct insert_task_argument *expected;
-  for (expected = expected_insert_task_arguments,
+  const struct task_insert_argument *expected;
+  for (expected = expected_task_insert_arguments,
 	 cl_args_offset = 1, scalars = 0, pointers = 0;
        expected->type != 0;
        expected++)
@@ -528,9 +528,9 @@ clSetKernelArg (cl_kernel kernel, cl_uint index, size_t size,
 		const void *value)
 {
   size_t n;
-  const struct insert_task_argument *arg;
+  const struct task_insert_argument *arg;
 
-  for (n = 0, arg = expected_insert_task_arguments;
+  for (n = 0, arg = expected_task_insert_arguments;
        n < index;
        n++, arg++)
     assert (arg->pointer != NULL);

+ 3 - 3
gcc-plugin/tests/opencl.c

@@ -46,15 +46,15 @@ main ()
 #pragma starpu register a
 
   static int x = 123;
-  struct insert_task_argument expected[] =
+  struct task_insert_argument expected[] =
     {
       { STARPU_VALUE, &x, sizeof x },
       { STARPU_RW, a },
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected;
-  expected_insert_task_targets = STARPU_OPENCL;
+  expected_task_insert_arguments = expected;
+  expected_task_insert_targets = STARPU_OPENCL;
   size_t y = 8; expected_cl_enqueue_kernel_arguments.global_work_size = &y;
 
   my_task (123, a);

+ 2 - 2
gcc-plugin/tests/output-pointer.c

@@ -84,14 +84,14 @@ main (int argc, char *argv[])
   expected_register_arguments.element_size = sizeof x[0];
   starpu_vector_data_register (&handle, STARPU_MAIN_RAM, (uintptr_t) x, 42, sizeof x[0]);
 
-  struct insert_task_argument expected[] =
+  struct task_insert_argument expected[] =
     {
       { STARPU_VALUE, &size, sizeof size },
       { STARPU_W, x },
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected;
+  expected_task_insert_arguments = expected;
 
   /* Invoke the task, which makes sure it gets called with EXPECTED.  */
   my_pointer_task (size, x);

+ 4 - 4
gcc-plugin/tests/pointers.c

@@ -92,14 +92,14 @@ main (int argc, char *argv[])
   expected_register_arguments.element_size = sizeof *y;
   starpu_vector_data_register (&handle, STARPU_MAIN_RAM, (uintptr_t) y, 1, sizeof *y);
 
-  struct insert_task_argument expected_pointer_task[] =
+  struct task_insert_argument expected_pointer_task[] =
     {
       { STARPU_R,  x },
       { STARPU_RW, y },
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected_pointer_task;
+  expected_task_insert_arguments = expected_pointer_task;
 
   /* Invoke the task, which should make sure it gets called with
      EXPECTED.  */
@@ -110,7 +110,7 @@ main (int argc, char *argv[])
 
   /* Likewise with `my_mixed_task'.  */
 
-  struct insert_task_argument expected_mixed_task[] =
+  struct task_insert_argument expected_mixed_task[] =
     {
       { STARPU_RW, x },
       { STARPU_VALUE, &z, sizeof z },
@@ -118,7 +118,7 @@ main (int argc, char *argv[])
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected_mixed_task;
+  expected_task_insert_arguments = expected_mixed_task;
 
   my_mixed_task (x, 0x77, y);
 

+ 4 - 0
include/starpu_task.h

@@ -125,9 +125,13 @@ struct starpu_task
 
 	void (*callback_func)(void *);
 	void *callback_arg;
+	/* must StarPU release callback_arg ? - 0 by default */
+	unsigned callback_arg_free;
 
 	void (*prologue_callback_func)(void *);
 	void *prologue_callback_arg;
+	/* must StarPU release prologue_callback_arg ? - 0 by default */
+	unsigned prologue_callback_arg_free;
 
 	unsigned use_tag;
 	starpu_tag_t tag_id;

+ 3 - 1
include/starpu_task_util.h

@@ -46,7 +46,9 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 #define STARPU_PROLOGUE_CALLBACK   (13<<16)
 #define STARPU_PROLOGUE_CALLBACK_ARG (14<<16)
 
-int starpu_insert_task(struct starpu_codelet *cl, ...);
+struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
+int starpu_task_insert(struct starpu_codelet *cl, ...);
+#define starpu_insert_task starpu_task_insert
 
 void starpu_codelet_unpack_args(void *cl_arg, ...);
 

+ 0 - 8
include/starpu_util.h

@@ -198,14 +198,6 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #endif
 
-/* This is needed in some places to make valgrind yield to another thread to be
- * able to progress.  */
-#if defined(__i386__) || defined(__x86_64__)
-#define STARPU_UYIELD() __asm__ __volatile("rep; nop")
-#else
-#define STARPU_UYIELD() ((void)0)
-#endif
-
 #ifdef __cplusplus
 }
 #endif

+ 7 - 5
mpi/examples/complex/mpi_complex.c

@@ -18,10 +18,12 @@
 #include <interface/complex_interface.h>
 #include <interface/complex_codelet.h>
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 void display_foo_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	int *foo = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	fprintf(stderr, "foo = %d\n", *foo);
+	FPRINTF(stderr, "foo = %d\n", *foo);
 }
 
 struct starpu_codelet foo_display =
@@ -75,17 +77,17 @@ int main(int argc, char **argv)
 		{
 			int *compare_ptr = &compare;
 
-			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value"), STARPU_R, handle, 0);
+			starpu_task_insert(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value"), STARPU_R, handle, 0);
 			starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
 			starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
 
-			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value"), STARPU_R, handle2, 0);
-			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
+			starpu_task_insert(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value"), STARPU_R, handle2, 0);
+			starpu_task_insert(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
 		}
 		else if (rank == 1)
 		{
 			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
-			starpu_insert_task(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value"), STARPU_R, handle, 0);
+			starpu_task_insert(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value"), STARPU_R, handle, 0);
 			starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
 		}
 

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_cholesky.c

@@ -64,8 +64,8 @@ int main(int argc, char **argv)
 
 	if (rank == 0)
 	{
-		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
-		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+		FPRINTF(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		FPRINTF(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
 	}
 
 	return 0;

+ 18 - 18
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -112,20 +112,20 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		int prio = STARPU_DEFAULT_PRIO;
 		if (!noprio) prio = STARPU_MAX_PRIO;
 
-		starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
-				STARPU_PRIORITY, prio,
-				STARPU_RW, data_handles[k][k],
-				0);
+		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+				       STARPU_PRIORITY, prio,
+				       STARPU_RW, data_handles[k][k],
+				       0);
 
 		for (j = k+1; j<nblocks; j++)
 		{
 			prio = STARPU_DEFAULT_PRIO;
 			if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
-			starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
-					STARPU_PRIORITY, prio,
-					STARPU_R, data_handles[k][k],
-					STARPU_RW, data_handles[k][j],
-					0);
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, prio,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[k][j],
+					       0);
 
 			for (i = k+1; i<nblocks; i++)
 			{
@@ -133,12 +133,12 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 				{
 					prio = STARPU_DEFAULT_PRIO;
 					if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
-					starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
-							STARPU_PRIORITY, prio,
-							STARPU_R, data_handles[k][i],
-							STARPU_R, data_handles[k][j],
-							STARPU_RW, data_handles[i][j],
-							0);
+					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+							       STARPU_PRIORITY, prio,
+							       STARPU_R, data_handles[k][i],
+							       STARPU_R, data_handles[k][j],
+							       STARPU_RW, data_handles[i][j],
+							       0);
 				}
 			}
 		}
@@ -186,7 +186,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 		}
 	}
 
-	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
+	FPRINTF(stderr, "[%d] compute explicit LLt ...\n", rank);
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
@@ -203,7 +203,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	SSYRK("L", "N", size, size, 1.0f,
 			rmat, size, 0.0f, test_mat, size);
 
-	fprintf(stderr, "[%d] comparing results ...\n", rank);
+	FPRINTF(stderr, "[%d] comparing results ...\n", rank);
 	if (display)
 	{
 		for (j = 0; j < size; j++)
@@ -241,7 +241,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 							float err = abs(test_mat[j +i*size] - orig);
 							if (err > 0.00001)
 							{
-								fprintf(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
 								*correctness = 0;
 								*flops = 0;
 								break;

+ 4 - 2
mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c

@@ -22,6 +22,8 @@
 #include "mpi_decomposition_matrix.h"
 #include "mpi_decomposition_params.h"
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 int main(int argc, char **argv)
 {
 	/* create a simple definite positive symetric matrix example
@@ -56,8 +58,8 @@ int main(int argc, char **argv)
 
 	if (rank == 0)
 	{
-		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
-		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+		FPRINTF(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		FPRINTF(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
 	}
 
 	return 0;

+ 3 - 1
mpi/examples/matrix_decomposition/mpi_cholesky_models.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,8 @@
 
 #include <starpu.h>
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 extern struct starpu_perfmodel chol_model_11;
 extern struct starpu_perfmodel chol_model_21;
 extern struct starpu_perfmodel chol_model_22;

+ 42 - 42
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -39,15 +39,15 @@ struct callback_arg {
 
 static void create_task_11(unsigned k)
 {
-	starpu_mpi_insert_task(MPI_COMM_WORLD,
-			&STARPU_PLU(cl11),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_RW, STARPU_PLU(get_block_handle)(k, k),
-			STARPU_PRIORITY, !no_prio ?
-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
-			0);
+	starpu_mpi_task_insert(MPI_COMM_WORLD,
+			       &STARPU_PLU(cl11),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_RW, STARPU_PLU(get_block_handle)(k, k),
+			       STARPU_PRIORITY, !no_prio ?
+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			       0);
 }
 
 /*
@@ -57,17 +57,17 @@ static void create_task_11(unsigned k)
 static void create_task_12(unsigned k, unsigned j)
 {
 #warning temporary fix 
-	starpu_mpi_insert_task(MPI_COMM_WORLD,
-			//&STARPU_PLU(cl12),
-			&STARPU_PLU(cl21),
-			STARPU_VALUE, &j, sizeof(j),
-			STARPU_VALUE, &j, sizeof(j),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_R, STARPU_PLU(get_block_handle)(k, k),
-			STARPU_RW, STARPU_PLU(get_block_handle)(k, j),
-			STARPU_PRIORITY, !no_prio && (j == k+1) ?
-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
-			0);
+	starpu_mpi_task_insert(MPI_COMM_WORLD,
+			       //&STARPU_PLU(cl12),
+			       &STARPU_PLU(cl21),
+			       STARPU_VALUE, &j, sizeof(j),
+			       STARPU_VALUE, &j, sizeof(j),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_R, STARPU_PLU(get_block_handle)(k, k),
+			       STARPU_RW, STARPU_PLU(get_block_handle)(k, j),
+			       STARPU_PRIORITY, !no_prio && (j == k+1) ?
+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			       0);
 }
 
 /*
@@ -77,17 +77,17 @@ static void create_task_12(unsigned k, unsigned j)
 static void create_task_21(unsigned k, unsigned i)
 {
 #warning temporary fix 
-	starpu_mpi_insert_task(MPI_COMM_WORLD,
-			//&STARPU_PLU(cl21),
-			&STARPU_PLU(cl12),
-			STARPU_VALUE, &i, sizeof(i),
-			STARPU_VALUE, &i, sizeof(i),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_R, STARPU_PLU(get_block_handle)(k, k),
-			STARPU_RW, STARPU_PLU(get_block_handle)(i, k),
-			STARPU_PRIORITY, !no_prio && (i == k+1) ?
-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
-			0);
+	starpu_mpi_task_insert(MPI_COMM_WORLD,
+			       //&STARPU_PLU(cl21),
+			       &STARPU_PLU(cl12),
+			       STARPU_VALUE, &i, sizeof(i),
+			       STARPU_VALUE, &i, sizeof(i),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_R, STARPU_PLU(get_block_handle)(k, k),
+			       STARPU_RW, STARPU_PLU(get_block_handle)(i, k),
+			       STARPU_PRIORITY, !no_prio && (i == k+1) ?
+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			       0);
 }
 
 /*
@@ -96,17 +96,17 @@ static void create_task_21(unsigned k, unsigned i)
 
 static void create_task_22(unsigned k, unsigned i, unsigned j)
 {
-	starpu_mpi_insert_task(MPI_COMM_WORLD,
-			&STARPU_PLU(cl22),
-			STARPU_VALUE, &i, sizeof(i),
-			STARPU_VALUE, &j, sizeof(j),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_R, STARPU_PLU(get_block_handle)(k, j),
-			STARPU_R, STARPU_PLU(get_block_handle)(i, k),
-			STARPU_RW, STARPU_PLU(get_block_handle)(i, j),
-			STARPU_PRIORITY, !no_prio && (i == k + 1) && (j == k +1) ?
-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
-			0);
+	starpu_mpi_task_insert(MPI_COMM_WORLD,
+			       &STARPU_PLU(cl22),
+			       STARPU_VALUE, &i, sizeof(i),
+			       STARPU_VALUE, &j, sizeof(j),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_R, STARPU_PLU(get_block_handle)(k, j),
+			       STARPU_R, STARPU_PLU(get_block_handle)(i, k),
+			       STARPU_RW, STARPU_PLU(get_block_handle)(i, j),
+			       STARPU_PRIORITY, !no_prio && (i == k + 1) && (j == k +1) ?
+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			       0);
 }
 
 /*

+ 11 - 9
mpi/examples/stencil/stencil5.c

@@ -17,6 +17,8 @@
 #include <starpu_mpi.h>
 #include <math.h>
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	unsigned *xy = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -25,7 +27,7 @@ void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 	unsigned *xym1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[3]);
 	unsigned *xyp1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[4]);
 
-	//fprintf(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
+	//FPRINTF(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
 	*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
 }
 
@@ -107,14 +109,14 @@ int main(int argc, char **argv)
 			int mpi_rank = my_distrib(x, y, size);
 			if (mpi_rank == my_rank)
 			{
-				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
 				starpu_variable_data_register(&data_handles[x][y], STARPU_MAIN_RAM, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
 			}
 			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
 				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
 			{
 				/* I don't own that index, but will need it for my computations */
-				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
 				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
 			}
 			else
@@ -136,14 +138,14 @@ int main(int argc, char **argv)
 		{
 			for (y = 1; y < Y-1; y++)
 			{
-				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
 						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
 						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
 						       0);
 			}
 		}
 	}
-	fprintf(stderr, "Waiting ...\n");
+	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();
 
 	for(x = 0; x < X; x++)
@@ -162,15 +164,15 @@ int main(int argc, char **argv)
 
 	if (display)
 	{
-		fprintf(stdout, "[%d] mean=%d\n", my_rank, mean);
+		FPRINTF(stdout, "[%d] mean=%d\n", my_rank, mean);
 		for(x = 0; x < X; x++)
 		{
-			fprintf(stdout, "[%d] ", my_rank);
+			FPRINTF(stdout, "[%d] ", my_rank);
 			for (y = 0; y < Y; y++)
 			{
-				fprintf(stdout, "%3u ", matrix[x][y]);
+				FPRINTF(stdout, "%3u ", matrix[x][y]);
 			}
-			fprintf(stdout, "\n");
+			FPRINTF(stdout, "\n");
 		}
 	}
 

+ 2 - 1
mpi/include/starpu_mpi.h

@@ -47,7 +47,8 @@ int starpu_mpi_initialize(void) STARPU_DEPRECATED;
 int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;
 int starpu_mpi_shutdown(void);
 
-int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+#define starpu_mpi_insert_task starpu_mpi_task_insert
 void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);

+ 2 - 2
mpi/src/Makefile.am

@@ -36,14 +36,14 @@ noinst_HEADERS =					\
 	starpu_mpi_private.h				\
 	starpu_mpi_fxt.h				\
 	starpu_mpi_stats.h				\
-	starpu_mpi_insert_task.h			\
+	starpu_mpi_task_insert.h			\
 	starpu_mpi_datatype.h
 
 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 	starpu_mpi.c					\
 	starpu_mpi_helper.c				\
 	starpu_mpi_datatype.c				\
-	starpu_mpi_insert_task.c			\
+	starpu_mpi_task_insert.c			\
 	starpu_mpi_collective.c				\
 	starpu_mpi_stats.c				\
 	starpu_mpi_private.c

+ 267 - 121
mpi/src/starpu_mpi.c

@@ -21,7 +21,7 @@
 #include <starpu_mpi_private.h>
 #include <starpu_profiling.h>
 #include <starpu_mpi_stats.h>
-#include <starpu_mpi_insert_task.h>
+#include <starpu_mpi_task_insert.h>
 #include <common/config.h>
 #include <common/thread.h>
 
@@ -38,7 +38,7 @@ static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t dat
 							int source, int mpi_tag, MPI_Comm comm,
 							unsigned detached, void (*callback)(void *), void *arg,
 							int sequential_consistency, int is_internal_req,
-							ssize_t psize);
+							ssize_t count);
 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 
 /* The list of requests that have been newly submitted by the application */
@@ -62,31 +62,43 @@ static int posted_requests = 0, newer_requests, barrier_running = 0;
 
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 
-struct _starpu_mpi_copy_handle
+LIST_TYPE(_starpu_mpi_copy_handle,
+	  starpu_data_handle_t handle;
+	  struct _starpu_mpi_envelope *env;
+	  struct _starpu_mpi_req *req;
+	  void *buffer;
+	  int mpi_tag;
+	  int source;
+	  int req_ready;
+	  starpu_pthread_mutex_t req_mutex;
+	  starpu_pthread_cond_t req_cond;
+);
+
+struct _starpu_mpi_copy_handle_hashlist
 {
-	starpu_data_handle_t handle;
-	struct _starpu_mpi_envelope *env;
-	int mpi_tag;
+	struct _starpu_mpi_copy_handle_list *list;
 	UT_hash_handle hh;
-	struct _starpu_mpi_req *req;
+	int mpi_tag;
 };
 
- /********************************************************/
- /*                                                      */
- /*  Hashmap's requests functionalities                  */
- /*                                                      */
- /********************************************************/
+/********************************************************/
+/*                                                      */
+/*  Hashmap's requests functionalities                  */
+/*                                                      */
+/********************************************************/
 
 /** stores application requests for which data have not been received yet */
-static struct _starpu_mpi_req *_starpu_mpi_app_req_hashmap = NULL;
+static struct _starpu_mpi_req **_starpu_mpi_app_req_hashmap = NULL;
+static int _starpu_mpi_app_req_hashmap_count = 0;
 /** stores data which have been received by MPI but have not been requested by the application */
-static struct _starpu_mpi_copy_handle *_starpu_mpi_copy_handle_hashmap = NULL;
+static struct _starpu_mpi_copy_handle_hashlist **_starpu_mpi_copy_handle_hashmap = NULL;
+static int _starpu_mpi_copy_handle_hashmap_count = 0;
 
-static struct _starpu_mpi_req* find_app_req(int mpi_tag)
+static struct _starpu_mpi_req* find_app_req(int mpi_tag, int source)
 {
 	struct _starpu_mpi_req* req;
 
-	HASH_FIND_INT(_starpu_mpi_app_req_hashmap, &mpi_tag, req);
+	HASH_FIND_INT(_starpu_mpi_app_req_hashmap[source], &mpi_tag, req);
 
 	return req;
 }
@@ -95,24 +107,25 @@ static void add_app_req(struct _starpu_mpi_req *req)
 {
 	struct _starpu_mpi_req *test_req;
 
-	test_req = find_app_req(req->mpi_tag);
+	test_req = find_app_req(req->mpi_tag, req->srcdst);
 
 	if (test_req == NULL)
 	{
-		HASH_ADD_INT(_starpu_mpi_app_req_hashmap, mpi_tag, req);
-		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap. \n", req, req->mpi_tag);
+		HASH_ADD_INT(_starpu_mpi_app_req_hashmap[req->srcdst], mpi_tag, req);
+		_starpu_mpi_app_req_hashmap_count ++;
+		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
 	}
 	else
 	{
-		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap. \n", req, req->mpi_tag);
+		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
 		int seq_const = starpu_data_get_sequential_consistency_flag(req->data_handle);
 		if (seq_const &&  req->sequential_consistency)
 		{
-			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap, while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, test_req);
+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, req->srcdst, test_req);
 		}
 		else
 		{
-			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap, while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, test_req);
+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, req->srcdst, test_req);
 		}
 	}
 }
@@ -121,61 +134,115 @@ static void delete_app_req(struct _starpu_mpi_req *req)
 {
 	struct _starpu_mpi_req *test_req;
 
-	test_req = find_app_req(req->mpi_tag);
+	test_req = find_app_req(req->mpi_tag, req->srcdst);
 
 	if (test_req != NULL)
 	{
-		HASH_DEL(_starpu_mpi_app_req_hashmap, req);
-		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap. \n", req, req->mpi_tag);
+		HASH_DEL(_starpu_mpi_app_req_hashmap[req->srcdst], req);
+		_starpu_mpi_app_req_hashmap_count --;
+		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
 	}
 	else
 	{
-		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap. \n", req, req->mpi_tag);
+		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
 	}
 }
 
-static struct _starpu_mpi_copy_handle* find_chandle(int mpi_tag)
+static void _starpu_mpi_copy_handle_display_hash(int source, int tag)
 {
-	struct _starpu_mpi_copy_handle* chandle;
-
-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap, &mpi_tag, chandle);
+	struct _starpu_mpi_copy_handle_hashlist *hashlist;
+	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[source], &tag, hashlist);
 
-	return chandle;
+	if (hashlist == NULL)
+	{
+		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d does not exist\n", source, tag);
+	}
+	else if (_starpu_mpi_copy_handle_list_empty(hashlist->list))
+	{
+		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d is empty\n", source, tag);
+	}
+	else
+	{
+		struct _starpu_mpi_copy_handle *cur;
+		for (cur = _starpu_mpi_copy_handle_list_begin(hashlist->list) ;
+		     cur != _starpu_mpi_copy_handle_list_end(hashlist->list);
+		     cur = _starpu_mpi_copy_handle_list_next(cur))
+		{
+			_STARPU_MPI_DEBUG(60, "Element for source %d and tag %d: %p\n", source, tag, cur);
+		}
+	}
 }
 
-static void add_chandle(struct _starpu_mpi_copy_handle *chandle)
+static struct _starpu_mpi_copy_handle *pop_chandle(int mpi_tag, int source, int delete)
 {
-	struct _starpu_mpi_copy_handle *test_chandle;
+	struct _starpu_mpi_copy_handle_hashlist *hashlist;
+	struct _starpu_mpi_copy_handle *chandle;
 
-	test_chandle = find_chandle(chandle->mpi_tag);
-
-	if (test_chandle == NULL)
+	_STARPU_MPI_DEBUG(60, "Looking for chandle with tag %d in the hashmap[%d]\n", mpi_tag, source);
+	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[source], &mpi_tag, hashlist);
+	if (hashlist == NULL)
 	{
-		HASH_ADD_INT(_starpu_mpi_copy_handle_hashmap, mpi_tag, chandle);
-		_STARPU_MPI_DEBUG(3, "Adding copied handle %p with tag %d in the hashmap. \n", chandle, chandle->mpi_tag);
+		chandle = NULL;
 	}
 	else
 	{
-		_STARPU_MPI_DEBUG(3, "Error add_chandle : copied handle %p with tag %d already in the hashmap. \n", chandle, chandle->mpi_tag);
-		STARPU_ASSERT(test_chandle != NULL);
+		if (_starpu_mpi_copy_handle_list_empty(hashlist->list))
+		{
+			chandle = NULL;
+		}
+		else
+		{
+			if (delete == 1)
+			{
+				chandle = _starpu_mpi_copy_handle_list_pop_front(hashlist->list);
+			}
+			else
+			{
+				chandle = _starpu_mpi_copy_handle_list_front(hashlist->list);
+			}
+		}
 	}
+	_STARPU_MPI_DEBUG(60, "Found chandle %p with tag %d in the hashmap[%d]\n", chandle, mpi_tag, source);
+	return chandle;
 }
 
-static void delete_chandle(struct _starpu_mpi_copy_handle *chandle)
+static struct _starpu_mpi_copy_handle *find_chandle(int mpi_tag, int source)
 {
-	struct _starpu_mpi_copy_handle *test_chandle;
+	return pop_chandle(mpi_tag, source, 0);
+}
 
-	test_chandle = find_chandle(chandle->mpi_tag);
+static void add_chandle(struct _starpu_mpi_copy_handle *chandle)
+{
+	_STARPU_MPI_DEBUG(60, "Trying to add chandle %p with tag %d in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
 
-	if (test_chandle != NULL)
+	struct _starpu_mpi_copy_handle_hashlist *hashlist;
+	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[chandle->source], &chandle->mpi_tag, hashlist);
+	if (hashlist == NULL)
 	{
-		HASH_DEL(_starpu_mpi_copy_handle_hashmap, chandle);
-		_STARPU_MPI_DEBUG(3, "Deleting copied handle %p with tag %d from the hashmap. \n", chandle, chandle->mpi_tag);
-	}
-	else
-	{
-		_STARPU_MPI_DEBUG(3, "Warning delete_chandle : copied handle %p with tag %d isn't in the hashmap. \n", chandle, chandle->mpi_tag);
+		hashlist = malloc(sizeof(struct _starpu_mpi_copy_handle_hashlist));
+		hashlist->list = _starpu_mpi_copy_handle_list_new();
+		hashlist->mpi_tag = chandle->mpi_tag;
+		HASH_ADD_INT(_starpu_mpi_copy_handle_hashmap[chandle->source], mpi_tag, hashlist);
 	}
+	_starpu_mpi_copy_handle_list_push_back(hashlist->list, chandle);
+	_starpu_mpi_copy_handle_hashmap_count ++;
+#ifdef STARPU_VERBOSE
+	_starpu_mpi_copy_handle_display_hash(chandle->source, chandle->mpi_tag);
+#endif
+}
+
+static void delete_chandle(struct _starpu_mpi_copy_handle *chandle)
+{
+	_STARPU_MPI_DEBUG(60, "Trying to delete chandle %p with tag %d in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
+	struct _starpu_mpi_copy_handle *found = pop_chandle(chandle->mpi_tag, chandle->source, 1);
+
+	STARPU_ASSERT_MSG(found == chandle,
+			  "Error delete_chandle : chandle %p with tag %d is NOT in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
+
+	_starpu_mpi_copy_handle_hashmap_count --;
+#ifdef STARPU_VERBOSE
+	_starpu_mpi_copy_handle_display_hash(chandle->source, chandle->mpi_tag);
+#endif
 }
 
 static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
@@ -183,7 +250,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 	/* Initialize the request structure */
 	req->data_handle = NULL;
 
-	req->datatype = NULL;
+	req->datatype = 0;
 	req->ptr = NULL;
 	req->count = -1;
 	req->user_datatype = -1;
@@ -195,7 +262,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 	req->func = NULL;
 
 	req->status = NULL;
-	req->request = NULL;
+	req->request = 0;
 	req->flag = NULL;
 
 	req->ret = -1;
@@ -216,7 +283,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 	req->callback = NULL;
 	req->callback_arg = NULL;
 
-	req->size_req = NULL;
+	req->size_req = 0;
 	req->internal_req = NULL;
 	req->is_internal_req = 0;
 	req->envelope = NULL;
@@ -236,7 +303,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 							       enum starpu_data_access_mode mode,
 							       int sequential_consistency,
 							       int is_internal_req,
-							       ssize_t psize)
+							       ssize_t count)
 {
 
 	 _STARPU_MPI_LOG_IN();
@@ -258,7 +325,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 	 req->func = func;
 	 req->sequential_consistency = sequential_consistency;
 	 req->is_internal_req = is_internal_req;
-	 req->count = psize;
+	 req->count = count;
 
 	 /* Asynchronously request StarPU to fetch the data in main memory: when
 	  * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
@@ -312,11 +379,12 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 
 	if (req->user_datatype == 0)
 	{
+		int size;
 		req->count = 1;
 		req->ptr = starpu_data_get_local_ptr(req->data_handle);
 
-		req->envelope->psize = (ssize_t)req->count;
-
+		MPI_Type_size(req->datatype, &size);
+		req->envelope->size = (ssize_t)req->count * size;
 		_STARPU_MPI_DEBUG(1, "Post MPI isend count (%ld) datatype_size %ld request to %d with tag %d\n",req->count,starpu_data_get_size(req->data_handle),req->srcdst, _starpu_mpi_tag);
 		MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
 	}
@@ -325,30 +393,30 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 		int ret;
 
  		// Do not pack the data, just try to find out the size
-		starpu_data_pack(req->data_handle, NULL, &(req->envelope->psize));
+		starpu_data_pack(req->data_handle, NULL, &(req->envelope->size));
 
-		if (req->envelope->psize != -1)
+		if (req->envelope->size != -1)
  		{
  			// We already know the size of the data, let's send it to overlap with the packing of the data
-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (first call to pack)\n", req->envelope->psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
-			req->count = req->envelope->psize;
+			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (first call to pack)\n", req->envelope->size, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
+			req->count = req->envelope->size;
 			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
 			STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %d", ret);
  		}
 
  		// Pack the data
  		starpu_data_pack(req->data_handle, &req->ptr, &req->count);
-		if (req->envelope->psize == -1)
+		if (req->envelope->size == -1)
  		{
  			// We know the size now, let's send it
-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (second call to pack)\n", req->envelope->psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
+			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (second call to pack)\n", req->envelope->size, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
 			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
 			STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %d", ret);
  		}
  		else
  		{
  			// We check the size returned with the 2 calls to pack is the same
-			STARPU_ASSERT_MSG(req->count == req->envelope->psize, "Calls to pack_data returned different sizes %ld != %ld", req->count, req->envelope->psize);
+			STARPU_ASSERT_MSG(req->count == req->envelope->size, "Calls to pack_data returned different sizes %ld != %ld", req->count, req->envelope->size);
  		}
 		// We can send the data now
 	}
@@ -415,7 +483,7 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
 	STARPU_ASSERT_MSG(req->ptr, "Invalid pointer to receive data");
 
-	_STARPU_MPI_DEBUG(2, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
 	TRACE_MPI_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
 
@@ -435,9 +503,9 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 }
 
-static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, ssize_t psize)
+static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, ssize_t count)
 {
-	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W, sequential_consistency, is_internal_req, psize);
+	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W, sequential_consistency, is_internal_req, count);
 }
 
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
@@ -445,12 +513,12 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT_MSG(public_req, "starpu_mpi_irecv needs a valid starpu_mpi_req");
 
-	// We check if a tag is defined for the data handle, if not,
-	// we define the one given for the communication.
-	// A tag is necessary for the internal mpi engine.
-	int tag = starpu_data_get_tag(data_handle);
-	if (tag == -1)
-		starpu_data_set_tag(data_handle, mpi_tag);
+//	// We check if a tag is defined for the data handle, if not,
+//	// we define the one given for the communication.
+//	// A tag is necessary for the internal mpi engine.
+//	int tag = starpu_data_get_tag(data_handle);
+//	if (tag == -1)
+//		starpu_data_set_tag(data_handle, mpi_tag);
 
 	struct _starpu_mpi_req *req;
 	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, NULL, NULL, 1, 0, 0);
@@ -466,12 +534,12 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int
 {
 	_STARPU_MPI_LOG_IN();
 
-	// We check if a tag is defined for the data handle, if not,
-	// we define the one given for the communication.
-	// A tag is necessary for the internal mpi engine.
-	int tag = starpu_data_get_tag(data_handle);
-	if (tag == -1)
-		starpu_data_set_tag(data_handle, mpi_tag);
+//	// We check if a tag is defined for the data handle, if not,
+//	// we define the one given for the communication.
+//	// A tag is necessary for the internal mpi engine.
+//	int tag = starpu_data_get_tag(data_handle);
+//	if (tag == -1)
+//		starpu_data_set_tag(data_handle, mpi_tag);
 
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg, 1, 0, 0);
 	_STARPU_MPI_LOG_OUT();
@@ -482,6 +550,13 @@ int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_h
 {
 	_STARPU_MPI_LOG_IN();
 
+//	// We check if a tag is defined for the data handle, if not,
+//	// we define the one given for the communication.
+//	// A tag is necessary for the internal mpi engine.
+//	int tag = starpu_data_get_tag(data_handle);
+//	if (tag == -1)
+//		starpu_data_set_tag(data_handle, mpi_tag);
+
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg, sequential_consistency, 0, 0);
 
 	_STARPU_MPI_LOG_OUT();
@@ -493,12 +568,12 @@ int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, M
 	starpu_mpi_req req;
 	_STARPU_MPI_LOG_IN();
 
-	// We check if a tag is defined for the data handle, if not,
-	// we define the one given for the communication.
-	// A tag is necessary for the internal mpi engine.
-	int tag = starpu_data_get_tag(data_handle);
-	if (tag == -1)
-		starpu_data_set_tag(data_handle, mpi_tag);
+//	// We check if a tag is defined for the data handle, if not,
+//	// we define the one given for the communication.
+//	// A tag is necessary for the internal mpi engine.
+//	int tag = starpu_data_get_tag(data_handle);
+//	if (tag == -1)
+//		starpu_data_set_tag(data_handle, mpi_tag);
 
 	starpu_mpi_irecv(data_handle, &req, source, mpi_tag, comm);
 	starpu_mpi_wait(&req, status);
@@ -779,7 +854,8 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 
 	if (req->internal_req)
 	{
-		struct _starpu_mpi_copy_handle *chandle = find_chandle(starpu_data_get_tag(req->data_handle));
+		struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag, req->srcdst);
+		STARPU_ASSERT_MSG(chandle, "Could not find a copy data handle with the tag %d and the node %d\n", req->mpi_tag, req->srcdst);
 		_STARPU_MPI_DEBUG(3, "Handling deleting of copy_handle structure from the hashmap..\n");
 		delete_chandle(chandle);
 		free(chandle);
@@ -839,6 +915,7 @@ struct _starpu_mpi_copy_cb_args
 	starpu_data_handle_t data_handle;
 	starpu_data_handle_t copy_handle;
 	struct _starpu_mpi_req *req;
+	void *buffer;
 };
 
 static void _starpu_mpi_copy_cb(void* arg)
@@ -850,19 +927,30 @@ static void _starpu_mpi_copy_cb(void* arg)
 	args->req->request = args->req->internal_req->request;
 	args->req->submitted = 1;
 
-	struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->copy_handle);
-	void* itf_src = starpu_data_get_interface_on_node(args->copy_handle,0);
-	void* itf_dst = starpu_data_get_interface_on_node(args->data_handle,0);
-
-	if (!itf->copy_methods->ram_to_ram)
+	if (args->buffer)
 	{
-		_STARPU_MPI_DEBUG(3, "Initiating any_to_any copy..\n");
-		itf->copy_methods->any_to_any(itf_src, 0, itf_dst, 0, NULL);
+		/* Data has been received as a raw memory, it has to be unpacked */
+		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->copy_handle);
+		struct starpu_data_interface_ops *itf_dst = starpu_data_get_interface_ops(args->data_handle);
+		itf_dst->unpack_data(args->data_handle, 0, args->buffer, itf_src->get_size(args->copy_handle));
+		free(args->buffer);
 	}
 	else
 	{
-		_STARPU_MPI_DEBUG(3, "Initiating ram_to_ram copy..\n");
-		itf->copy_methods->ram_to_ram(itf_src, 0, itf_dst, 0);
+		struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->copy_handle);
+		void* itf_src = starpu_data_get_interface_on_node(args->copy_handle,0);
+		void* itf_dst = starpu_data_get_interface_on_node(args->data_handle,0);
+
+		if (!itf->copy_methods->ram_to_ram)
+		{
+			_STARPU_MPI_DEBUG(3, "Initiating any_to_any copy..\n");
+			itf->copy_methods->any_to_any(itf_src, 0, itf_dst, 0, NULL);
+		}
+		else
+		{
+			_STARPU_MPI_DEBUG(3, "Initiating ram_to_ram copy..\n");
+			itf->copy_methods->ram_to_ram(itf_src, 0, itf_dst, 0);
+		}
 	}
 
 	_STARPU_MPI_DEBUG(3, "Done, handling release of copy_handle..\n");
@@ -889,7 +977,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
-	_STARPU_MPI_DEBUG(3, "calling _starpu_mpi_submit_new_mpi_request with req %p tag %d and type %s\n", req, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
+	_STARPU_MPI_DEBUG(3, "calling _starpu_mpi_submit_new_mpi_request with req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
@@ -929,7 +1017,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 		else
 		{
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
-			struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag);
+			struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag, req->srcdst);
 
 			/* Case : the request has already been submitted internally by StarPU.
 			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
@@ -937,6 +1025,13 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 			 * bring the data back to the original data handle associated to the request.*/
 			if (chandle)
 			{
+				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+				STARPU_PTHREAD_MUTEX_LOCK(&(chandle->req_mutex));
+				while (!(chandle->req_ready))
+					STARPU_PTHREAD_COND_WAIT(&(chandle->req_cond), &(chandle->req_mutex));
+				STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req_mutex));
+				STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+
 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %d has already been received, copying previously received data into handle's pointer..\n", req, req->mpi_tag);
 				STARPU_ASSERT(req->data_handle != chandle->handle);
 
@@ -945,6 +1040,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				struct _starpu_mpi_copy_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_copy_cb_args));
 				cb_args->data_handle = req->data_handle;
 				cb_args->copy_handle = chandle->handle;
+				cb_args->buffer = chandle->buffer;
 				cb_args->req = req;
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
@@ -954,6 +1050,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 			 * We just add the pending receive request to the requests' hashmap. */
 			else
 			{
+				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
 				add_app_req(req);
 			}
 		}
@@ -1119,6 +1216,7 @@ static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
 static void *_starpu_mpi_progress_thread_func(void *arg)
 {
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
+	int rank, worldsize;
 
 	if (argc_argv->initialize_mpi)
 	{
@@ -1137,10 +1235,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
 	}
 
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
+	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
+
 	{
-		int rank, worldsize;
-		MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-		MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
 		TRACE_MPI_START(rank, worldsize);
 #ifdef STARPU_USE_FXT
 		starpu_profiling_set_id(rank);
@@ -1162,7 +1261,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (HASH_COUNT(_starpu_mpi_app_req_hashmap) == 0);
+		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (_starpu_mpi_app_req_hashmap_count == 0);
 
 #ifndef STARPU_MPI_ACTIVITY
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -1202,7 +1301,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
 		 * requests in our side, we resubmit a header request. */
 		MPI_Request header_req;
-		if ((HASH_COUNT(_starpu_mpi_app_req_hashmap) > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
+		if ((_starpu_mpi_app_req_hashmap_count > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
 		{
 			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop\n");
 			MPI_Irecv(recv_env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _starpu_mpi_tag, MPI_COMM_WORLD, &header_req);
@@ -1226,37 +1325,53 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 			if (flag)
 			{
-				_STARPU_MPI_DEBUG(3, "Searching for application request with tag %d (size %ld)\n", recv_env->mpi_tag, recv_env->psize);
+				_STARPU_MPI_DEBUG(3, "Searching for application request with tag %d and source %d (size %ld)\n", recv_env->mpi_tag, status.MPI_SOURCE, recv_env->size);
 
-				struct _starpu_mpi_req *found_req = find_app_req(recv_env->mpi_tag);
+				struct _starpu_mpi_req *found_req = find_app_req(recv_env->mpi_tag, status.MPI_SOURCE);
 
 				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
 				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
 				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
 				if (!found_req)
 				{
-					_STARPU_MPI_DEBUG(3, "Request with tag %d not found, creating a copy_handle to receive incoming data..\n",recv_env->mpi_tag);
+					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a copy_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
 
 					starpu_data_handle_t data_handle = NULL;
 
-					while(!(data_handle))
-					{
-						STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-						data_handle = starpu_data_get_data_handle_from_tag(recv_env->mpi_tag);
-						STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-					}
-					STARPU_ASSERT(data_handle);
+					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+					data_handle = starpu_data_get_data_handle_from_tag(recv_env->mpi_tag);
+					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
-					struct _starpu_mpi_copy_handle* chandle = malloc(sizeof(struct _starpu_mpi_copy_handle));
+					struct _starpu_mpi_copy_handle* chandle = calloc(1, sizeof(struct _starpu_mpi_copy_handle));
 					STARPU_ASSERT(chandle);
-
+					STARPU_PTHREAD_MUTEX_INIT(&chandle->req_mutex, NULL);
+					STARPU_PTHREAD_COND_INIT(&chandle->req_cond, NULL);
 					chandle->mpi_tag = recv_env->mpi_tag;
 					chandle->env = recv_env;
-					starpu_data_register_same(&chandle->handle, data_handle);
-					add_chandle(chandle);
+					chandle->source = status.MPI_SOURCE;
+
+					if (data_handle)
+					{
+						chandle->buffer = NULL;
+						starpu_data_register_same(&chandle->handle, data_handle);
+						add_chandle(chandle);
+					}
+					else
+					{
+						/* The application has not registered yet a data with the tag,
+						 * we are going to receive the data as a raw memory, and give it
+						 * to the application when it post a receive for this tag
+						 */
+						_STARPU_MPI_DEBUG(20, "Posting a receive for a data of size %d which has not yet been registered\n", (int)chandle->env->size);
+						chandle->buffer = malloc(chandle->env->size);
+						starpu_vector_data_register(&chandle->handle, 0, (uintptr_t) chandle->buffer, chandle->env->size, 1);
+						add_chandle(chandle);
+					}
 
-					_STARPU_MPI_DEBUG(3, "Posting internal detached irecv on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
-					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL, 1, 1, recv_env->psize);
+					_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
+					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL, 1, 1, recv_env->size);
+					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 					// We wait until the request is pushed in the
 					// new_request list, that ensures that the next loop
@@ -1268,6 +1383,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					while (!(chandle->req->posted))
 					     STARPU_PTHREAD_COND_WAIT(&(chandle->req->posted_cond), &(chandle->req->posted_mutex));
 					STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req->posted_mutex));
+
+					STARPU_PTHREAD_MUTEX_LOCK(&chandle->req_mutex);
+					chandle->req_ready = 1;
+					STARPU_PTHREAD_COND_BROADCAST(&chandle->req_cond);
+					STARPU_PTHREAD_MUTEX_UNLOCK(&chandle->req_mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
@@ -1286,7 +1406,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					}
 					else
 					{
-						found_req->count = recv_env->psize;
+						found_req->count = recv_env->size;
 						found_req->ptr = malloc(found_req->count);
 
 						STARPU_ASSERT_MSG(found_req->ptr, "cannot allocate message of size %ld\n", found_req->count);
@@ -1313,8 +1433,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
-	STARPU_ASSERT_MSG(HASH_COUNT(_starpu_mpi_app_req_hashmap) == 0, "Number of receive requests left is not zero");
-	STARPU_ASSERT_MSG(HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0, "Number of copy requests left is not zero");
+	STARPU_ASSERT_MSG(_starpu_mpi_app_req_hashmap_count == 0, "Number of receive requests left is not zero");
+	STARPU_ASSERT_MSG(_starpu_mpi_copy_handle_hashmap_count == 0, "Number of copy requests left is not zero");
+
 	if (argc_argv->initialize_mpi)
 	{
 		_STARPU_MPI_DEBUG(3, "Calling MPI_Finalize()\n");
@@ -1323,6 +1444,21 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
+	{
+		int n;
+		struct _starpu_mpi_copy_handle_hashlist *hashlist;
+
+		for(n=0 ; n<worldsize; n++)
+		{
+			for(hashlist=_starpu_mpi_copy_handle_hashmap[n]; hashlist != NULL; hashlist=hashlist->hh.next)
+			{
+				_starpu_mpi_copy_handle_list_delete(hashlist->list);
+			}
+		}
+	}
+
+	free(_starpu_mpi_app_req_hashmap);
+	free(_starpu_mpi_copy_handle_hashmap);
 	free(argc_argv);
 	free(recv_env);
 
@@ -1406,6 +1542,16 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
 	_starpu_mpi_cache_init(MPI_COMM_WORLD);
+
+	{
+		int nb_nodes, k;
+		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
+		_starpu_mpi_app_req_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_req *));
+		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
+		_starpu_mpi_copy_handle_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_copy_handle_hash_list *));
+		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_copy_handle_hashmap[k] = NULL;
+	}
+
 	return 0;
 }
 

+ 1 - 1
mpi/src/starpu_mpi_private.h

@@ -86,7 +86,7 @@ enum _starpu_mpi_request_type
 
 struct _starpu_mpi_envelope
 {
-	ssize_t psize;
+	ssize_t size;
 	int mpi_tag;
 };
 

+ 111 - 33
mpi/src/starpu_mpi_insert_task.c

@@ -22,7 +22,7 @@
 #include <starpu_data.h>
 #include <common/utils.h>
 #include <common/uthash.h>
-#include <util/starpu_insert_task_utils.h>
+#include <util/starpu_task_insert_utils.h>
 #include <datawizard/coherency.h>
 #include <core/task.h>
 
@@ -195,7 +195,7 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_a
 			 * The application knows we won't do anything
 			 * about this task */
 			/* Yes, the app could actually not call
-			 * insert_task at all itself, this is just a
+			 * task_insert at all itself, this is just a
 			 * safeguard. */
 			_STARPU_MPI_DEBUG(3, "oh oh\n");
 			_STARPU_MPI_LOG_OUT();
@@ -363,7 +363,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 	}
 }
 
-int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 {
 	int arg_type;
 	va_list varg_list;
@@ -585,7 +585,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	{
 		/* Get the number of buffers and the size of the arguments */
 		va_start(varg_list, codelet);
-		arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
+		arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list);
 
 		/* Pack arguments if needed */
 		if (arg_buffer_size)
@@ -596,13 +596,16 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 		_STARPU_MPI_DEBUG(1, "Execution of the codelet %p (%s)\n", codelet, codelet->name);
 		va_start(varg_list, codelet);
+
 		struct starpu_task *task = starpu_task_create();
+		task->cl_arg_free = 1;
+
 		if (codelet->nbuffers > STARPU_NMAXBUFS)
 		{
 			task->dyn_handles = malloc(codelet->nbuffers * sizeof(starpu_data_handle_t));
 		}
-		int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, codelet, &task, varg_list);
-		STARPU_ASSERT_MSG(ret==0, "_starpu_insert_task_create_and_submit failure %d", ret);
+		int ret = _starpu_task_insert_create_and_submit(arg_buffer, arg_buffer_size, codelet, &task, varg_list);
+		STARPU_ASSERT_MSG(ret==0, "_starpu_task_insert_create_and_submit failure %d", ret);
 	}
 
 	if (inconsistent_execute)
@@ -809,7 +812,60 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	}
 }
 
-/* TODO: this should rather be implicitly called by starpu_mpi_insert_task when
+struct _starpu_mpi_redux_data_args
+{
+	starpu_data_handle_t data_handle;
+	starpu_data_handle_t new_handle;
+	int tag;
+	int node;
+	MPI_Comm comm;
+	struct starpu_task *taskB;
+};
+
+void _starpu_mpi_redux_data_dummy_func(STARPU_ATTRIBUTE_UNUSED void *buffers[], STARPU_ATTRIBUTE_UNUSED void *cl_arg)
+{
+}
+
+struct starpu_codelet _starpu_mpi_redux_data_read_cl =
+{
+	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
+	.cuda_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
+	.opencl_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.name = "_starpu_mpi_redux_data_read_cl"
+};
+
+struct starpu_codelet _starpu_mpi_redux_data_readwrite_cl =
+{
+	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
+	.cuda_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
+	.opencl_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.name = "_starpu_mpi_redux_data_write_cl"
+};
+
+void _starpu_mpi_redux_data_detached_callback(void *arg)
+{
+	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) arg;
+
+	STARPU_TASK_SET_HANDLE(args->taskB, args->new_handle, 1);
+	int ret = starpu_task_submit(args->taskB);
+	STARPU_ASSERT(ret == 0);
+
+	starpu_data_unregister_submit(args->new_handle);
+}
+
+void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
+{
+	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) callback_arg;
+	starpu_data_register_same(&args->new_handle, args->data_handle);
+
+	starpu_mpi_irecv_detached_sequential_consistency(args->new_handle, args->node, args->tag, args->comm, _starpu_mpi_redux_data_detached_callback, args, 0);
+}
+
+/* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
  * a data previously accessed in REDUX mode gets accessed in R mode. */
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 {
@@ -836,46 +892,68 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 	// need to count how many nodes have the data in redux mode
 	if (me == rank)
 	{
-		int i;
+		int i, j=0;
+		struct starpu_task *taskBs[nb_nodes];
 
 		for(i=0 ; i<nb_nodes ; i++)
 		{
 			if (i != rank)
 			{
-				starpu_data_handle_t new_handle;
-
-				starpu_data_register_same(&new_handle, data_handle);
-
-				_STARPU_MPI_DEBUG(1, "Receiving redux handle from %d in %p ...\n", i, new_handle);
-
-				/* FIXME: we here allocate a lot of data: one
-				 * instance per MPI node and per number of
-				 * times we are called. We should rather do
-				 * that much later, e.g. after data_handle
-				 * finished its last read access, by submitting
-				 * an empty task A reading data_handle whose
-				 * callback submits the mpi comm, whose
-				 * callback submits the redux_cl task B with
-				 * sequential consistency set to 0, and submit
-				 * an empty task C writing data_handle and
-				 * depending on task B, just to replug with
-				 * implicit data dependencies with tasks
-				 * inserted after this reduction.
+				/* We need to make sure all is
+				 * executed after data_handle finished
+				 * its last read access, we hence do
+				 * the following:
+				 * - submit an empty task A reading
+				 * data_handle whose callback submits
+				 * the mpi comm with sequential
+				 * consistency set to 0, whose
+				 * callback submits the redux_cl task
+				 * B with sequential consistency set
+				 * to 0,
+				 * - submit an empty task C reading
+				 * and writing data_handle and
+				 * depending on task B, just to replug
+				 * with implicit data dependencies
+				 * with tasks inserted after this
+				 * reduction.
 				 */
-				starpu_mpi_irecv_detached(new_handle, i, tag, comm, NULL, NULL);
-				starpu_insert_task(data_handle->redux_cl,
-						   STARPU_RW, data_handle,
-						   STARPU_R, new_handle,
+
+				/* FIXME: free args */
+				struct _starpu_mpi_redux_data_args *args = malloc(sizeof(struct _starpu_mpi_redux_data_args));
+				args->data_handle = data_handle;
+				args->tag = tag;
+				args->node = i;
+				args->comm = comm;
+
+				// We need to create taskB early as
+				// taskC declares a dependancy on it
+				args->taskB = starpu_task_create();
+				args->taskB->cl = args->data_handle->redux_cl;
+				args->taskB->sequential_consistency = 0;
+				STARPU_TASK_SET_HANDLE(args->taskB, args->data_handle, 0);
+				taskBs[j] = args->taskB; j++;
+
+				// Submit taskA
+				starpu_task_insert(&_starpu_mpi_redux_data_read_cl,
+						   STARPU_R, data_handle,
+						   STARPU_CALLBACK_WITH_ARG, _starpu_mpi_redux_data_recv_callback, args,
 						   0);
-				starpu_data_unregister_submit(new_handle);
 			}
 		}
+
+		// Submit taskC which depends on all taskBs created
+		struct starpu_task *taskC = starpu_task_create();
+		taskC->cl = &_starpu_mpi_redux_data_readwrite_cl;
+		STARPU_TASK_SET_HANDLE(taskC, data_handle, 0);
+		starpu_task_declare_deps_array(taskC, j, taskBs);
+		int ret = starpu_task_submit(taskC);
+		STARPU_ASSERT(ret == 0);
 	}
 	else
 	{
 		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
 		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
-		starpu_insert_task(data_handle->init_cl, STARPU_W, data_handle, 0);
+		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
 	}
 	/* FIXME: In order to prevent simultaneous receive submissions
 	 * on the same handle, we need to wait that all the starpu_mpi

mpi/src/starpu_mpi_insert_task.h → mpi/src/starpu_mpi_task_insert.h


+ 10 - 2
mpi/tests/Makefile.am

@@ -102,7 +102,9 @@ starpu_mpi_TESTS =				\
 	multiple_send				\
 	mpi_scatter_gather			\
 	mpi_reduction				\
-	user_defined_datatype
+	user_defined_datatype			\
+	gather					\
+	gather2
 
 noinst_PROGRAMS =				\
 	datatypes				\
@@ -130,7 +132,9 @@ noinst_PROGRAMS =				\
 	multiple_send				\
 	mpi_scatter_gather			\
 	mpi_reduction				\
-	user_defined_datatype
+	user_defined_datatype			\
+	gather					\
+	gather2
 
 mpi_isend_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
@@ -184,6 +188,10 @@ mpi_reduction_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 user_defined_datatype_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+gather_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+gather2_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 ring_SOURCES = ring.c
 ring_async_SOURCES = ring_async.c

+ 76 - 0
mpi/tests/gather.c

@@ -0,0 +1,76 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	starpu_data_handle_t handle;
+	int var;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size<3)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need more than 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (rank == 0)
+	{
+		int n;
+		for(n=1 ; n<size ; n++)
+		{
+			MPI_Status status;
+
+			FPRINTF_MPI("receiving from node %d\n", n);
+			starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(var));
+			starpu_mpi_recv(handle, n, 42, MPI_COMM_WORLD, &status);
+			starpu_data_acquire(handle, STARPU_R);
+			STARPU_ASSERT_MSG(var == n, "Received incorrect value <%d> from node <%d>\n", var, n);
+			FPRINTF_MPI("received <%d> from node %d\n", var, n);
+			starpu_data_release(handle);
+			starpu_data_unregister(handle);
+		}
+	}
+	else
+	{
+		FPRINTF_MPI("sending to node %d\n", 0);
+		var = rank;
+		starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(var));
+		starpu_mpi_send(handle, 0, 42, MPI_COMM_WORLD);
+		starpu_data_unregister(handle);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return ret;
+}

+ 98 - 0
mpi/tests/gather2.c

@@ -0,0 +1,98 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size<3)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need more than 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (rank == 0)
+	{
+		int n;
+		for(n=1 ; n<size ; n++)
+		{
+			int i, var[2];
+			MPI_Status status[3];
+			starpu_data_handle_t handle[2];
+
+			FPRINTF_MPI("receiving from node %d\n", n);
+			for(i=0 ; i<2 ; i++)
+				starpu_variable_data_register(&handle[i], 0, (uintptr_t)&var[i], sizeof(var[i]));
+
+			starpu_mpi_recv(handle[0], n, 42, MPI_COMM_WORLD, &status[0]);
+			starpu_data_acquire(handle[0], STARPU_R);
+			STARPU_ASSERT_MSG(var[0] == n, "Received incorrect value <%d> from node <%d>\n", var[0], n);
+			FPRINTF_MPI("received <%d> from node %d\n", var[0], n);
+			starpu_data_release(handle[0]);
+
+			starpu_mpi_recv(handle[0], n, 42, MPI_COMM_WORLD, &status[1]);
+			starpu_mpi_recv(handle[1], n, 44, MPI_COMM_WORLD, &status[2]);
+			for(i=0 ; i<2 ; i++)
+				starpu_data_acquire(handle[i], STARPU_R);
+			STARPU_ASSERT_MSG(var[0] == n*2, "Received incorrect value <%d> from node <%d>\n", var[0], n);
+			STARPU_ASSERT_MSG(var[1] == n*4, "Received incorrect value <%d> from node <%d>\n", var[0], n);
+			FPRINTF_MPI("received <%d> and <%d> from node %d\n", var[0], var[1], n);
+			for(i=0 ; i<2 ; i++)
+				starpu_data_release(handle[i]);
+			for(i=0 ; i<2 ; i++)
+				starpu_data_unregister(handle[i]);
+		}
+	}
+	else
+	{
+		int i, var[3];
+		starpu_data_handle_t handle[3];
+
+		FPRINTF_MPI("sending to node %d\n", 0);
+		var[0] = rank;
+		var[1] = var[0] * 2;
+		var[2] = var[0] * 4;
+		for(i=0 ; i<3 ; i++)
+			starpu_variable_data_register(&handle[i], 0, (uintptr_t)&var[i], sizeof(var[i]));
+		starpu_mpi_send(handle[0], 0, 42, MPI_COMM_WORLD);
+		starpu_mpi_send(handle[1], 0, 42, MPI_COMM_WORLD);
+		starpu_mpi_send(handle[2], 0, 44, MPI_COMM_WORLD);
+		for(i=0 ; i<3 ; i++)
+			starpu_data_unregister(handle[i]);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return ret;
+}

+ 9 - 9
mpi/tests/insert_task.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -102,14 +102,14 @@ int main(int argc, char **argv)
 		}
 	}
 
-	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
+	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
+	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
+	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 
 	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();

+ 3 - 3
mpi/tests/insert_task_block.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -125,10 +125,10 @@ int main(int argc, char **argv)
 	{
 		for (y = 0; y < BLOCKS; y++)
 		{
-			ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+			ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
 						     STARPU_RW, data_handles[x][y],
 						     0);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 		}
 	}
 

+ 5 - 5
mpi/tests/insert_task_cache.c

@@ -82,14 +82,14 @@ void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 
 	for(i = 0; i < 5; i++)
 	{
-		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+		ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 	}
 
 	for(i = 0; i < 5; i++)
 	{
-		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+		ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 	}
 
 	starpu_task_wait_for_all();
@@ -130,7 +130,7 @@ int main(int argc, char **argv)
 	{
 		dst = (rank == 0) ? 1 : 0;
 		result = (comm_amount_with_cache[dst] == comm_amount_without_cache[dst] * 5);
-		fprintf(stderr, "Communication cache mechanism is %sworking\n", result?"":"NOT ");
+		FPRINTF_MPI("Communication cache mechanism is %sworking\n", result?"":"NOT ");
 	}
 	else
 		result = 1;

+ 10 - 10
mpi/tests/insert_task_owner.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -101,41 +101,41 @@ int main(int argc, char **argv)
 	}
 
 	node = starpu_data_get_rank(data_handlesx1);
-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r_w,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
 				     0);
 	assert(err == 0);
 
 	node = starpu_data_get_rank(data_handlesx0);
-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_rw_r,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
 				     0);
 	assert(err == 0);
 
-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_rw_rw,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
 				     0);
 	assert(err == -EINVAL);
 
 	node = 1;
-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_rw_rw,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 	assert(err == 0);
 
 	node = 0;
-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_rw_rw,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 	assert(err == 0);
 
 	node = 0;
-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r_r,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
@@ -145,7 +145,7 @@ int main(int argc, char **argv)
 	   going to overwrite the node even though the data model clearly specifies
 	   which node is going to execute the codelet */
 	node = 0;
-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r_w,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
@@ -155,13 +155,13 @@ int main(int argc, char **argv)
 	   going to overwrite the node even though the data model clearly specifies
 	   which node is going to execute the codelet */
 	node = 0;
-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_w_r,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
 				     0);
 	assert(err == 0);
 
-	fprintf(stderr, "Waiting ...\n");
+	FPRINTF_MPI("Waiting ...\n");
 	starpu_task_wait_for_all();
 	starpu_data_unregister(data_handlesx0);
 	starpu_data_unregister(data_handlesx1);

+ 2 - 2
mpi/tests/insert_task_owner2.c

@@ -93,12 +93,12 @@ int main(int argc, char **argv)
 	starpu_data_set_rank(data_handles[3], 1);
 	starpu_data_set_tag(data_handles[3], 3);
 
-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
 				     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
 				     STARPU_W, data_handles[2],
 				     STARPU_W, data_handles[3],
 				     STARPU_EXECUTE_ON_NODE, 1, 0);
-	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
+	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_task_insert");
 	starpu_task_wait_for_all();
 
 	int *values = malloc(4 * sizeof(int *));

+ 2 - 2
mpi/tests/insert_task_owner_data.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -72,7 +72,7 @@ int main(int argc, char **argv)
 	starpu_data_set_rank(data_handles[1], 1);
 	starpu_data_set_tag(data_handles[1], 1);
 
-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
 				     STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
 				     STARPU_EXECUTE_ON_DATA, data_handles[1],
 				     0);

+ 3 - 3
mpi/tests/mpi_earlyrecv.c

@@ -52,12 +52,12 @@ int main(int argc, char **argv)
 
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 
-	fprintf(stderr, "rank %d exchanging with rank %d\n", rank, other_rank);
+	FPRINTF_MPI("rank %d exchanging with rank %d\n", rank, other_rank);
 
 	if (rank%2)
 	{
 		starpu_mpi_isend(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
-		starpu_mpi_recv(tab_handle[2], other_rank, 2, MPI_COMM_WORLD, NULL);
+		starpu_mpi_recv(tab_handle[2], other_rank, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
 		starpu_mpi_isend(tab_handle[1], &request[1], other_rank, 1, MPI_COMM_WORLD);
 		nb_requests = 2;
 	}
@@ -80,7 +80,7 @@ int main(int argc, char **argv)
 				MPI_Status status;
 				starpu_mpi_test(&request[i], &flag, &status);
 				if (flag)
-					fprintf(stderr, "request[%d] = %d %p\n", i, flag, request[i]);
+					FPRINTF_MPI("request[%d] = %d %p\n", i, flag, request[i]);
 			}
 		}
 		finished = request[0] == NULL;

+ 1 - 1
mpi/tests/mpi_earlyrecv2.c

@@ -99,7 +99,7 @@ int exchange(int rank, starpu_data_handle_t *handles, check_func func, int detac
 		{
 			for(i=0 ; i<NB ; i++)
 			{
-			     starpu_mpi_wait(&req[i], NULL);
+			     starpu_mpi_wait(&req[i], MPI_STATUS_IGNORE);
 			     func(handles[i], i, rank, &ret);
 			}
 		}

+ 8 - 6
mpi/tests/mpi_reduction.c

@@ -17,6 +17,7 @@
 
 #include <starpu_mpi.h>
 #include <math.h>
+#include "helper.h"
 
 extern void init_cpu_func(void *descr[], void *cl_arg);
 extern void redux_cpu_func(void *descr[], void *cl_arg);
@@ -111,6 +112,7 @@ int main(int argc, char **argv)
 	handles = (starpu_data_handle_t *) malloc(nb_elements*sizeof(handles[0]));
 	for(x = 0; x < nb_elements; x+=step)
 	{
+		handles[x] = NULL;
 		int mpi_rank = my_distrib(x/step, size);
 		if (mpi_rank == my_rank)
 		{
@@ -136,17 +138,17 @@ int main(int argc, char **argv)
 	{
 		for (x = 0; x < nb_elements; x+=step)
 		{
-			starpu_mpi_insert_task(MPI_COMM_WORLD,
+			starpu_mpi_task_insert(MPI_COMM_WORLD,
 					       &dot_codelet,
 					       STARPU_R, handles[x],
 					       STARPU_REDUX, dot_handle,
 					       0);
 		}
 		starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
-		starpu_mpi_insert_task(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
+		starpu_mpi_task_insert(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
 	}
 
-	fprintf(stderr, "Waiting ...\n");
+	FPRINTF_MPI("Waiting ...\n");
 	starpu_task_wait_for_all();
 
 	for(x = 0; x < nb_elements; x+=step)
@@ -165,9 +167,9 @@ int main(int argc, char **argv)
 
 	if (my_rank == 0)
 	{
-		fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
-		fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
-		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
+		FPRINTF(stderr, "[%d] sum=%ld\n", my_rank, sum);
+		FPRINTF(stderr, "[%d] dot=%ld\n", my_rank, dot);
+		FPRINTF(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
 	}
 
 	return 0;

+ 7 - 10
mpi/tests/mpi_reduction_kernels.c

@@ -17,10 +17,7 @@
 #include <starpu.h>
 #include <mpi.h>
 
-#define _DISPLAY(fmt, ...) do { \
-		int _display_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_display_rank);	\
-		fprintf(stderr, "[%d][%s] " fmt , _display_rank, __starpu_func__ ,## __VA_ARGS__); 	\
-		fflush(stderr); } while(0)
+#include "helper.h"
 
 /*
  *	Codelet to create a neutral element
@@ -29,7 +26,7 @@ void init_cpu_func(void *descr[], void *cl_arg)
 {
 	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*dot = 0;
-	_DISPLAY("Init dot\n");
+	FPRINTF_MPI("Init dot\n");
 }
 
 /*
@@ -41,7 +38,7 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 	long int *dotb = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 	*dota = *dota + *dotb;
-	_DISPLAY("Calling redux %ld=%ld+%ld\n", *dota, *dota-*dotb, *dotb);
+	FPRINTF_MPI("Calling redux %ld=%ld+%ld\n", *dota, *dota-*dotb, *dotb);
 }
 
 /*
@@ -54,14 +51,14 @@ void dot_cpu_func(void *descr[], void *cl_arg)
 
 	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
-//	_DISPLAY("Before dot=%ld (adding %d elements...)\n", *dot, n);
+	//FPRINTF_MPI("Before dot=%ld (adding %d elements...)\n", *dot, n);
 	unsigned i;
 	for (i = 0; i < n; i++)
 	{
-//		_DISPLAY("Adding %ld\n", local_x[i]);
+		//FPRINTF_MPI("Adding %ld\n", local_x[i]);
 		*dot += local_x[i];
 	}
-//	_DISPLAY("After dot=%ld\n", *dot);
+	//FPRINTF_MPI("After dot=%ld\n", *dot);
 }
 
 /*
@@ -71,6 +68,6 @@ void display_cpu_func(void *descr[], void *cl_arg)
 {
 	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
 
-	_DISPLAY("Local=%ld\n", *local_x);
+	FPRINTF_MPI("Local=%ld\n", *local_x);
 }
 

+ 2 - 2
mpi/tests/mpi_redux.c

@@ -26,7 +26,7 @@ void callback(void *arg)
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	*received = *received + 1;
-	fprintf(stderr, "received = %d\n", *received);
+	FPRINTF_MPI("received = %d\n", *received);
 	STARPU_PTHREAD_COND_SIGNAL(&cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 }
@@ -89,7 +89,7 @@ int main(int argc, char **argv)
 		starpu_data_unregister_submit(handles[0]);
 
 		starpu_variable_data_register(&handles[0], STARPU_MAIN_RAM, (uintptr_t)&value, sizeof(int));
-		starpu_mpi_recv(handles[0], 0, 12+rank, MPI_COMM_WORLD, NULL);
+		starpu_mpi_recv(handles[0], 0, 12+rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
 		starpu_data_unregister(handles[0]);
 	}
 

+ 7 - 6
mpi/tests/mpi_scatter_gather.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,6 +15,7 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int y, int nb_nodes)
@@ -35,12 +36,12 @@ void cpu_codelet(void *descr[], void *_args)
 	starpu_codelet_unpack_args(_args, &rank);
 	factor = block[0];
 
-	//fprintf(stderr,"rank %d factor %f\n", rank, factor);
+	//FPRINTF_MPI("rank %d factor %f\n", rank, factor);
 	for (j = 0; j < nx; j++)
 	{
 		for (i = 0; i < nx; i++)
 		{
-			//fprintf(stderr,"rank %d factor %f --> %f %f\n", rank, factor, block[j+i*ld], block[j+i*ld]*factor);
+			//FPRINTF_MPI("rank %d factor %f --> %f %f\n", rank, factor, block[j+i*ld], block[j+i*ld]*factor);
 			block[j+i*ld] *= factor;
 		}
 	}
@@ -56,13 +57,13 @@ static struct starpu_codelet cl =
 void scallback(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 	char *msg = arg;
-	fprintf(stderr, "Sending completed for <%s>\n", msg);
+	FPRINTF_MPI("Sending completed for <%s>\n", msg);
 }
 
 void rcallback(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 	char *msg = arg;
-	fprintf(stderr, "Reception completed for <%s>\n", msg);
+	FPRINTF_MPI("Reception completed for <%s>\n", msg);
 }
 
 int main(int argc, char **argv)
@@ -177,7 +178,7 @@ int main(int argc, char **argv)
 			if (owner == rank)
 			{
 				//fprintf(stderr,"[%d] Computing on data[%d]\n", rank, x);
-				starpu_insert_task(&cl,
+				starpu_task_insert(&cl,
 						   STARPU_VALUE, &rank, sizeof(rank),
 						   STARPU_RW, data_handles[x],
 						   0);

+ 6 - 5
mpi/tests/user_defined_datatype.c

@@ -18,6 +18,7 @@
 #include <interface/complex_interface.h>
 #include <interface/complex_codelet.h>
 #include <user_defined_datatype_value.h>
+#include "helper.h"
 
 #ifdef STARPU_QUICK_CHECK
 #  define ELEMENTS 10
@@ -100,7 +101,7 @@ int main(int argc, char **argv)
 			float foo_compare=42.0;
 			int value_compare=36;
 
-			fprintf(stderr, "\nTesting with function %p\n", f);
+			FPRINTF_MPI("\nTesting with function %p\n", f);
 
 			if (rank == 0)
 			{
@@ -153,13 +154,13 @@ int main(int argc, char **argv)
 					compare = (foo[i] == foo_compare);
 					if (compare == 0)
 					{
-						fprintf(stderr, "ERROR. foo[%d] == %f != %f\n", i, foo[i], foo_compare);
+						FPRINTF_MPI("ERROR. foo[%d] == %f != %f\n", i, foo[i], foo_compare);
 						goto end;
 					}
 					compare = (values[i] == value_compare);
 					if (compare == 0)
 					{
-						fprintf(stderr, "ERROR. value[%d] == %d != %d\n", i, values[i], value_compare);
+						FPRINTF_MPI("ERROR. value[%d] == %d != %d\n", i, values[i], value_compare);
 						goto end;
 					}
 					for(j=0 ; j<2 ; j++)
@@ -167,7 +168,7 @@ int main(int argc, char **argv)
 						compare = (real[i][j] == real_compare[j]);
 						if (compare == 0)
 						{
-							fprintf(stderr, "ERROR. real[%d][%d] == %f != %f\n", i, j, real[i][j], real_compare[j]);
+							FPRINTF_MPI("ERROR. real[%d][%d] == %f != %f\n", i, j, real[i][j], real_compare[j]);
 							goto end;
 						}
 					}
@@ -176,7 +177,7 @@ int main(int argc, char **argv)
 						compare = (imaginary[i][j] == imaginary_compare[j]);
 						if (compare == 0)
 						{
-							fprintf(stderr, "ERROR. imaginary[%d][%d] == %f != %f\n", i, j, imaginary[i][j], imaginary_compare[j]);
+							FPRINTF_MPI("ERROR. imaginary[%d][%d] == %f != %f\n", i, j, imaginary[i][j], imaginary_compare[j]);
 							goto end;
 						}
 					}

+ 14 - 14
sc_hypervisor/examples/cholesky/cholesky_implicit.c

@@ -96,17 +96,17 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 		if(k == 0 && with_ctxs)
 		{
-			 ret = starpu_insert_task(&cl11,
-					   STARPU_PRIORITY, prio_level,
-					   STARPU_RW, sdatakk,
-					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
-					   STARPU_HYPERVISOR_TAG, hypervisor_tag,
-					   0);
+			 ret = starpu_task_insert(&cl11,
+						  STARPU_PRIORITY, prio_level,
+						  STARPU_RW, sdatakk,
+						  STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+						  STARPU_HYPERVISOR_TAG, hypervisor_tag,
+						  0);
 			set_hypervisor_conf(START_BENCH, hypervisor_tag++);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}
 		else
-			starpu_insert_task(&cl11,
+			starpu_task_insert(&cl11,
 					   STARPU_PRIORITY, prio_level,
 					   STARPU_RW, sdatakk,
 					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
@@ -116,12 +116,12 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		{
                         starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
 
-                        ret = starpu_insert_task(&cl21,
+                        ret = starpu_task_insert(&cl21,
 						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
 						 STARPU_R, sdatakk,
 						 STARPU_RW, sdatakj,
 						 0);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 			for (i = k+1; i<nblocks; i++)
 			{
@@ -133,25 +133,25 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					if(k == (nblocks-2) && j == (nblocks-1) &&
 					   i == (k + 1) && with_ctxs)
 					{
-						ret = starpu_insert_task(&cl22,
+						ret = starpu_task_insert(&cl22,
 								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
 								   STARPU_R, sdataki,
 								   STARPU_R, sdatakj,
 								   STARPU_RW, sdataij,
 								   STARPU_HYPERVISOR_TAG, hypervisor_tag,
 								   0);
-						STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 						set_hypervisor_conf(END_BENCH, hypervisor_tag++);
 					}
 					
 					else
-						ret = starpu_insert_task(&cl22,
+						ret = starpu_task_insert(&cl22,
 								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
 								   STARPU_R, sdataki,
 								   STARPU_R, sdatakj,
 								   STARPU_RW, sdataij,
 								   0);
-						STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 					
                    }
 			}

+ 2 - 0
sc_hypervisor/include/sc_hypervisor.h

@@ -128,6 +128,8 @@ void sc_hypervisor_update_diff_total_flops(unsigned sched_ctx, double diff_total
 /* change dynamically the number of the elapsed flops in a context, modify the past in order to better compute the speed */
 void sc_hypervisor_update_diff_elapsed_flops(unsigned sched_ctx, double diff_task_flops);
 
+/* updates the min and max workers needed by each context */
+	void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs);
 #ifdef __cplusplus
 }
 #endif

+ 1 - 1
sc_hypervisor/include/sc_hypervisor_lp.h

@@ -65,7 +65,7 @@ unsigned sc_hypervisor_lp_execute_dichotomy(int ns, int nw, double w_in_s[ns][nw
 /* linear program that returns 1/tmax, and computes in table res the nr of workers needed by each context st 
    the system ends up in the smallest tmax*/
 double sc_hypervisor_lp_simulate_distrib_flops(int nsched_ctxs, int ntypes_of_workers, double speed[nsched_ctxs][ntypes_of_workers], 
-					       double flops[nsched_ctxs], double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers]);
+					       double flops[nsched_ctxs], double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers], unsigned sched_ctxs[nsched_ctxs]);
 
 /* linear program that simulates a distribution of tasks that minimises the execution time of the tasks in the pool */
 double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt],

+ 1 - 0
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -73,6 +73,7 @@ static void feft_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx,
 				_try_resizing(NULL, -1, NULL, -1);
 			}
 		}
+	
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
 

+ 72 - 9
sc_hypervisor/src/policies_utils/lp_programs.c

@@ -249,14 +249,14 @@ double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_
 	return res;
 }
 
-double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw], double flops[ns], double res[ns][nw], int  total_nw[nw])
+double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw], double flops[ns], double res[ns][nw], int  total_nw[nw], unsigned sched_ctxs[ns])
 {
 	int integer = 1;
 	int s, w;
 	glp_prob *lp;
 
-	int ne =
-		(ns*nw+1)*(ns+nw)
+	int ne = //ns * (nw*ns + 1) +
+		(ns*nw+1)*(2*ns+nw)
 		+ 1; /* glp dumbness */
 	int n = 1;
 	int ia[ne], ja[ne];
@@ -338,6 +338,69 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 		n++;
 	}
 
+	/* one row corresponds to one ctx*/
+	glp_add_rows(lp, ns);
+
+	for(s = 0; s < ns; s++)
+	{
+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs[s]);
+		char name[32];
+		snprintf(name, sizeof(name), "ctx%d", s);
+		glp_set_row_name(lp, ns+s+1, name);
+		glp_set_row_bnds(lp, ns+s+1, GLP_LO, 0., 0.);
+		
+
+		int s2;
+		for(s2 = 0; s2 < ns; s2++)
+		{
+			if(s2 == s)
+			{
+
+				for(w = 0; w < nw; w++)
+				{
+					/* only for CPUs for now */
+					if(w == 0)
+					{
+						ia[n] = ns+s+1;
+						ja[n] = w+s2*nw + 1;
+						ar[n] = 1.0;
+//					printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
+					}
+					else
+					{
+						ia[n] = ns+s+1;
+						ja[n] = w+s2*nw + 1;
+						ar[n] = 0.0;
+//					printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
+
+					}
+					n++;
+				}
+			}
+			else
+			{
+				for(w = 0; w < nw; w++)
+				{
+
+					ia[n] = ns+s+1;
+					ja[n] = w+s2*nw + 1;
+					ar[n] = 0.0;
+//					printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
+					n++;
+				}
+				
+			}
+				
+		}
+		ia[n] = ns+s+1;
+		ja[n] = ns*nw+1;
+		ar[n] = 0.0;
+		n++;
+		
+		glp_set_row_bnds(lp, ns+s+1, GLP_UP, config->min_nworkers, config->max_nworkers);
+
+	}
+
 	/*we add another linear constraint : sum(all cpus) = 9 and sum(all gpus) = 3 */
 	glp_add_rows(lp, nw);
 
@@ -345,7 +408,7 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 	{
 		char name[32];
 		snprintf(name, sizeof(name), "w%d", w);
-		glp_set_row_name(lp, ns+w+1, name);
+		glp_set_row_name(lp, 2*ns+w+1, name);
 		for(s = 0; s < ns; s++)
 		{
 			int w2;
@@ -353,14 +416,14 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 			{
 				if(w2 == w)
 				{
-					ia[n] = ns+w+1;
+					ia[n] = 2*ns+w+1;
 					ja[n] = w2+s*nw + 1;
 					ar[n] = 1.0;
 //					printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
 				}
 				else
 				{
-					ia[n] = ns+w+1;
+					ia[n] = 2*ns+w+1;
 					ja[n] = w2+s*nw + 1;
 					ar[n] = 0.0;
 //					printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
@@ -369,7 +432,7 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 			}
 		}
 		/* 1/tmax */
-		ia[n] = ns+w+1;
+		ia[n] = 2*ns+w+1;
 		ja[n] = ns*nw+1;
 		ar[n] = 0.0;
 //		printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
@@ -377,11 +440,11 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 
 		/*sum(all gpus) = 3*/
 		if(w == 0)
-			glp_set_row_bnds(lp, ns+w+1, GLP_FX, total_nw[0], total_nw[0]);
+			glp_set_row_bnds(lp, 2*ns+w+1, GLP_FX, total_nw[0], total_nw[0]);
 
 		/*sum(all cpus) = 9*/
 		if(w == 1)
-			glp_set_row_bnds(lp, ns+w+1, GLP_FX, total_nw[1], total_nw[1]);
+			glp_set_row_bnds(lp, 2*ns+w+1, GLP_FX, total_nw[1], total_nw[1]);
 	}
 
 	STARPU_ASSERT(n == ne);

+ 5 - 2
sc_hypervisor/src/policies_utils/lp_tools.c

@@ -28,6 +28,8 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 	double v[nsched_ctxs][ntypes_of_workers];
 	double flops[nsched_ctxs];
 
+	sc_hypervisor_update_resize_interval(sched_ctxs, nsched_ctxs);
+
 	int nw = tw->nw;
 	int i = 0;
 	struct sc_hypervisor_wrapper* sc_w;
@@ -59,8 +61,9 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 /* 		printf("%d: flops %lf remaining flops %lf ready flops %lf nready_tasks %d\n", */
 /* 		       sched_ctxs[i], flops[i], sc_w->remaining_flops/1000000000, sc_w->ready_flops/1000000000, sc_w->nready_tasks); */
 	}
-
-	double vmax = 1/sc_hypervisor_lp_simulate_distrib_flops(nsched_ctxs, ntypes_of_workers, v, flops, res, total_nw);
+		
+	
+	double vmax = 1/sc_hypervisor_lp_simulate_distrib_flops(nsched_ctxs, ntypes_of_workers, v, flops, res, total_nw, sched_ctxs);
 	double optimal_v = 0.0;
 	for(i = 0; i < nsched_ctxs; i++)
 	{

+ 1 - 0
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -465,6 +465,7 @@ unsigned sc_hypervisor_check_idle(unsigned sched_ctx, int worker)
 	struct sc_hypervisor_policy_config *config = sc_w->config;
 	if(config != NULL)
 	{
+		printf("w%d/ctx%d: current idle %lf max_idle %lf\n", worker, sched_ctx, sc_w->current_idle_time[worker], config->max_idle[worker]);
 		if(sc_w->current_idle_time[worker] > config->max_idle[worker])
 		{
 			sc_w->current_idle_time[worker] = 0.0;

+ 1 - 1
sc_hypervisor/src/sc_config.c

@@ -74,7 +74,7 @@ void _add_config(unsigned sched_ctx)
 {
 	struct sc_hypervisor_policy_config *config = _create_config();
 	config->min_nworkers = 0;
-	config->max_nworkers = STARPU_NMAXWORKERS;
+	config->max_nworkers = starpu_worker_get_count();
 	config->new_workers_max_idle = MAX_IDLE_TIME;
 
 	int i;

+ 149 - 2
sc_hypervisor/src/sc_hypervisor.c

@@ -458,6 +458,7 @@ static void _reset_idle_time(unsigned sched_ctx)
 	for(i = 0; i < STARPU_NMAXWORKERS; i++)
 	{
 		hypervisor.sched_ctx_w[sched_ctx].idle_time[i] = 0.0;
+		hypervisor.sched_ctx_w[sched_ctx].idle_start_time[i] = hypervisor.sched_ctx_w[sched_ctx].idle_start_time[i] != 0.0 ? starpu_timing_now() : 0.0;
 	}
 	return;
 }
@@ -473,6 +474,12 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 		sender_sc_w->start_time = start_time;
 		_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
 		_reset_idle_time(sender_sched_ctx);
+		int i;
+		for(i = 0; i < STARPU_NMAXWORKERS; i++)
+		{
+			sender_sc_w->idle_start_time[i] = 0.0;
+		}
+		
 	}
 
 	if(receiver_sched_ctx != STARPU_NMAX_SCHED_CTXS)
@@ -483,6 +490,12 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 		receiver_sc_w->start_time = start_time;
 		_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
 		_reset_idle_time(receiver_sched_ctx);
+		int i;
+		for(i = 0; i < STARPU_NMAXWORKERS; i++)
+		{
+			receiver_sc_w->idle_start_time[i] = (hypervisor.sched_ctx_w[receiver_sched_ctx].idle_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
+		}
+
 	}
 }
 
@@ -763,6 +776,115 @@ void sc_hypervisor_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *work
 		hypervisor.policy.resize_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
 }
 
+void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
+{
+	unsigned sched_ctx;
+	int total_max_nworkers = 0;
+	int max_cpus = starpu_cpu_worker_get_count();
+	double max_workers_idle_time[nsched_ctxs];
+	unsigned configured = 0;
+	int i;
+	for(i = 0; i < nsched_ctxs; i++)
+	{
+		sched_ctx = sched_ctxs[i];
+
+		if(hypervisor.sched_ctx_w[sched_ctx].to_be_sized) continue;
+
+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
+		struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
+		int worker;
+		
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+		
+		max_workers_idle_time[i] = 0.0;
+		while(workers->has_next(workers, &it))
+		{
+			worker = workers->get_next(workers, &it);
+			if(hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker]==0.0)
+			{
+				max_workers_idle_time[i] += hypervisor.sched_ctx_w[sched_ctx].idle_time[worker]; /* in seconds */
+			}
+			else
+			{
+				double end_time  = starpu_timing_now();
+				double idle = (end_time - hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker]) / 1000000.0; /* in seconds */ 
+				max_workers_idle_time[i] += hypervisor.sched_ctx_w[sched_ctx].idle_time[worker] + idle;
+			}				
+		}			
+
+		
+		double curr_time = starpu_timing_now();
+		double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /* in seconds */
+		double norm_idle_time = max_workers_idle_time[i] / elapsed_time;
+
+		config->max_nworkers = 	workers->nworkers - lrint(norm_idle_time) + hypervisor.sched_ctx_w[sched_ctx].nready_tasks;
+		
+		if(config->max_nworkers < 0)
+			config->max_nworkers = 0;
+		if(config->max_nworkers > max_cpus)
+			config->max_nworkers = max_cpus;
+		
+		printf("%d: ready tasks  %d idle for long %lf norm_idle_time %lf elapsed_time %lf nworkers %d max %d \n", 
+		       sched_ctx, hypervisor.sched_ctx_w[sched_ctx].nready_tasks, max_workers_idle_time[i], norm_idle_time, elapsed_time, workers->nworkers, config->max_nworkers);
+
+/* 		if(max_workers_idle_time[i] > 0.000002) */
+/* 		{ */
+/* 			double curr_time = starpu_timing_now(); */
+/* 			double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /\* in seconds *\/ */
+/* 			double norm_idle_time = max_workers_idle_time[i] / elapsed_time; */
+
+/* 			config->max_nworkers = 	workers->nworkers - lrint(norm_idle_time); */
+/* 			if(config->max_nworkers < 0) */
+/* 				config->max_nworkers = 0; */
+			
+/* 			printf("%d: ready tasks  %d idle for long %lf norm_idle_time %lf elapsed_time %lf nworkers %d decr %d \n",  */
+/* 			       sched_ctx, hypervisor.sched_ctx_w[sched_ctx].nready_tasks, max_workers_idle_time[i], norm_idle_time, elapsed_time, workers->nworkers, config->max_nworkers); */
+
+/* 		} */
+/* 		else */
+/* 		{ */
+/* 			double curr_time = starpu_timing_now(); */
+/* 			double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /\* in seconds *\/ */
+/* 			double norm_idle_time = max_workers_idle_time[i] / elapsed_time; */
+			
+/* 			if(workers->nworkers == 0 && hypervisor.sched_ctx_w[sched_ctx].nready_tasks == 1) */
+/* 				config->max_nworkers = 0; */
+/* 			else */
+/* 			{ */
+/* 				config->max_nworkers = (hypervisor.sched_ctx_w[sched_ctx].nready_tasks > max_cpus)  */
+/* 					? max_cpus : hypervisor.sched_ctx_w[sched_ctx].nready_tasks; */
+/* 				config->max_nworkers = workers->nworkers > config->max_nworkers ? workers->nworkers : config->max_nworkers; */
+/* 			} */
+/* 			printf("%d: ready tasks  %d not idle %lf norm_idle_time %lf elapsed_time %lf nworkers %d incr %d \n",  */
+/* 			       sched_ctx, hypervisor.sched_ctx_w[sched_ctx].nready_tasks, max_workers_idle_time[i], norm_idle_time, elapsed_time, workers->nworkers, config->max_nworkers); */
+/* 		} */
+
+		total_max_nworkers += config->max_nworkers;
+		configured = 1;
+	}
+
+	/*if the sum of the max cpus is smaller than the total cpus available 
+	  increase the max for the ones having more ready tasks to exec */
+	if(configured && total_max_nworkers < max_cpus)
+	{
+		int diff = max_cpus - total_max_nworkers;
+		int max_nready = -1;
+		unsigned max_nready_sched_ctx = sched_ctxs[0];
+		for(i = 0; i < nsched_ctxs; i++)
+		{
+			if(max_nready < hypervisor.sched_ctx_w[sched_ctxs[i]].nready_tasks)
+			{
+				max_nready = hypervisor.sched_ctx_w[sched_ctxs[i]].nready_tasks;
+				max_nready_sched_ctx = sched_ctxs[i];
+			}
+		}
+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(max_nready_sched_ctx);
+		config->max_nworkers += diff;
+		printf("%d: redib max_nworkers incr %d \n",  max_nready_sched_ctx, config->max_nworkers);
+	}
+}
 /* notifies the hypervisor that the worker is no longer idle and a new task was pushed on its queue */
 static void notify_idle_end(unsigned sched_ctx, int worker)
 {
@@ -777,7 +899,7 @@ static void notify_idle_end(unsigned sched_ctx, int worker)
 		sc_w->idle_time[worker] += (end_time - sc_w->idle_start_time[worker]) / 1000000.0; /* in seconds */ 
 		sc_w->idle_start_time[worker] = 0.0;
 	}
-
+			
 	if(hypervisor.policy.handle_idle_end)
 		hypervisor.policy.handle_idle_end(sched_ctx, worker);
 
@@ -793,7 +915,7 @@ static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
 
 		if(sc_w->idle_start_time[worker] == 0.0)
 			sc_w->idle_start_time[worker] = starpu_timing_now();
-
+		
 		if(hypervisor.policy.handle_idle_cycle)
 		{
 			hypervisor.policy.handle_idle_cycle(sched_ctx, worker);
@@ -842,6 +964,31 @@ static void notify_poped_task(unsigned sched_ctx, int worker, struct starpu_task
 		hypervisor.sched_ctx_w[sched_ctx].ready_flops = 0.0;
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 
+/* 	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx); */
+	
+/* 	unsigned finished_sample = 0; */
+/* 	char *speed_sample_criteria = getenv("SC_HYPERVISOR_SAMPLE_CRITERIA"); */
+/* 	if(speed_sample_criteria && (strcmp(speed_sample_criteria, "time") == 0)) */
+/* 	{ */
+
+/* 		double curr_time = starpu_timing_now(); */
+/* 		double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /\* in seconds *\/ */
+
+/* 		finished_sample = elapsed_time > config->time_sample; */
+/* 	} */
+/* 	else */
+/* 	{ */
+/* 		double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]); */
+/* 		double ctx_sample = config->ispeed_ctx_sample; */
+
+/* 		finished_sample = ctx_elapsed_flops > ctx_sample; */
+/* 	} */
+
+/* 	if(finished_sample) */
+/* 	{ */
+/* 		sc_hypervisor_update_resize_interval(sched_ctx); */
+/* 	} */
+	
 	if(hypervisor.resize[sched_ctx])
 	{	
 		if(hypervisor.policy.handle_poped_task)

+ 3 - 3
src/Makefile.am

@@ -126,7 +126,7 @@ noinst_HEADERS = 						\
 	debug/traces/starpu_fxt.h				\
 	profiling/bound.h					\
 	profiling/profiling.h					\
-	util/starpu_insert_task_utils.h				\
+	util/starpu_task_insert_utils.h				\
 	util/starpu_data_cpy.h					\
 	util/starpu_task_list_inline.h				\
 	starpu_parameters.h					\
@@ -223,8 +223,8 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	util/file.c						\
 	util/misc.c						\
 	util/starpu_data_cpy.c					\
-	util/starpu_insert_task.c				\
-	util/starpu_insert_task_utils.c				\
+	util/starpu_task_insert.c				\
+	util/starpu_task_insert_utils.c				\
 	util/starpu_inlines.c					\
 	debug/traces/starpu_fxt.c				\
 	debug/traces/starpu_fxt_mpi.c				\

+ 157 - 91
src/common/fxt.h

@@ -400,11 +400,11 @@ do {										\
 /* We skip these events becasue they are called so often that they cause FxT to
  * fail and make the overall trace unreadable anyway. */
 #define _STARPU_TRACE_START_PROGRESS(memnode)		\
-	do {} while (0);
+	do {} while (0)
 //	FUT_DO_PROBE2(_STARPU_FUT_START_PROGRESS, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_END_PROGRESS(memnode)		\
-	do {} while (0);
+	do {} while (0)
 	//FUT_DO_PROBE2(_STARPU_FUT_END_PROGRESS, memnode, _starpu_gettid());
 	
 #define _STARPU_TRACE_USER_EVENT(code)			\
@@ -418,80 +418,146 @@ do {										\
 
 #ifdef STARPU_FXT_LOCK_TRACES 
 
-#define _STARPU_TRACE_LOCKING_MUTEX(file,line)	\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_LOCKING_MUTEX,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_MUTEX_LOCKED(file,line)			\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_MUTEX_LOCKED,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_UNLOCKING_MUTEX(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_MUTEX,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_MUTEX_UNLOCKED(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_MUTEX_UNLOCKED,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_TRYLOCK_MUTEX(file,line)			\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_TRYLOCK_MUTEX,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_RDLOCKING_RWLOCK(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RDLOCKING_RWLOCK,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_RWLOCK_RDLOCKED(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_RDLOCKED,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_WRLOCKING_RWLOCK(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_WRLOCKING_RWLOCK,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_RWLOCK_WRLOCKED(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_WRLOCKED,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_UNLOCKING_RWLOCK(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_RWLOCK,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_RWLOCK_UNLOCKED(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_UNLOCKED,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_LOCKING_SPINLOCK(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_LOCKING_SPINLOCK,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_SPINLOCK_LOCKED(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_SPINLOCK_LOCKED,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_UNLOCKING_SPINLOCK(file,line)	\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_SPINLOCK,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_SPINLOCK_UNLOCKED(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_SPINLOCK_UNLOCKED,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_TRYLOCK_SPINLOCK(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_TRYLOCK_SPINLOCK,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_COND_WAIT_BEGIN(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_COND_WAIT_BEGIN,line,_starpu_gettid(),file);
-
-#define _STARPU_TRACE_COND_WAIT_END(file,line)		\
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_COND_WAIT_END,line,_starpu_gettid(),file);
+#define _STARPU_TRACE_LOCKING_MUTEX()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_LOCKING_MUTEX,__LINE__,_starpu_gettid(),file); \
+} while (0)
+
+#define _STARPU_TRACE_MUTEX_LOCKED()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_MUTEX_LOCKED,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define _STARPU_TRACE_UNLOCKING_MUTEX()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_MUTEX,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define _STARPU_TRACE_MUTEX_UNLOCKED()	do {\
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_MUTEX_UNLOCKED,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define _STARPU_TRACE_TRYLOCK_MUTEX()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_TRYLOCK_MUTEX,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define _STARPU_TRACE_RDLOCKING_RWLOCK()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RDLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define _STARPU_TRACE_RWLOCK_RDLOCKED()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_RDLOCKED,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define _STARPU_TRACE_WRLOCKING_RWLOCK()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_WRLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define _STARPU_TRACE_RWLOCK_WRLOCKED()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_WRLOCKED,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define _STARPU_TRACE_UNLOCKING_RWLOCK()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define _STARPU_TRACE_RWLOCK_UNLOCKED()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_UNLOCKED,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define STARPU_TRACE_SPINLOCK_CONDITITION (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER)
+
+#define _STARPU_TRACE_LOCKING_SPINLOCK()	do {\
+	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
+		const char *file; \
+		file = strrchr(__FILE__,'/') + 1; \
+		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_LOCKING_SPINLOCK,__LINE__,_starpu_gettid(),file); \
+	} \
+} while(0)
+
+#define _STARPU_TRACE_SPINLOCK_LOCKED()		do { \
+	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
+		const char *file; \
+		file = strrchr(__FILE__,'/') + 1; \
+		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_SPINLOCK_LOCKED,__LINE__,_starpu_gettid(),file); \
+	} \
+} while(0)
+
+#define _STARPU_TRACE_UNLOCKING_SPINLOCK()	do { \
+	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
+		const char *file; \
+		file = strrchr(__FILE__,'/') + 1; \
+		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_SPINLOCK,__LINE__,_starpu_gettid(),file); \
+	} \
+} while(0)
+
+#define _STARPU_TRACE_SPINLOCK_UNLOCKED()	do { \
+	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
+		const char *file; \
+		file = strrchr(__FILE__,'/') + 1; \
+		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_SPINLOCK_UNLOCKED,__LINE__,_starpu_gettid(),file); \
+	} \
+} while(0)
+
+#define _STARPU_TRACE_TRYLOCK_SPINLOCK()	do { \
+	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
+		const char *file; \
+		file = strrchr(__FILE__,'/') + 1; \
+		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_TRYLOCK_SPINLOCK,__LINE__,_starpu_gettid(),file); \
+	} \
+} while(0)
+
+#define _STARPU_TRACE_COND_WAIT_BEGIN()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_COND_WAIT_BEGIN,__LINE__,_starpu_gettid(),file); \
+} while(0)
+
+#define _STARPU_TRACE_COND_WAIT_END()	do { \
+	const char *file; \
+	file = strrchr(__FILE__,'/') + 1; \
+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_COND_WAIT_END,__LINE__,_starpu_gettid(),file); \
+} while(0)
 
 #else // !STARPU_FXT_LOCK_TRACES
 
-#define _STARPU_TRACE_LOCKING_MUTEX(file,line)			do {} while(0)
-#define _STARPU_TRACE_MUTEX_LOCKED(file,line)			do {} while(0)
-#define _STARPU_TRACE_UNLOCKING_MUTEX(file,line)		do {} while(0)
-#define _STARPU_TRACE_MUTEX_UNLOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_TRYLOCK_MUTEX(file,line)			do {} while(0)
-#define _STARPU_TRACE_RDLOCKING_RWLOCK(file,line)		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_RDLOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_WRLOCKING_RWLOCK(file,line)		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_WRLOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_UNLOCKING_RWLOCK(file,line)		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_UNLOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_LOCKING_SPINLOCK(file,line)		do {} while(0)
-#define _STARPU_TRACE_SPINLOCK_LOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_UNLOCKING_SPINLOCK(file,line)	do {} while(0)
-#define _STARPU_TRACE_SPINLOCK_UNLOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_TRYLOCK_SPINLOCK(file,line)		do {} while(0)
-#define _STARPU_TRACE_COND_WAIT_BEGIN(file,line)		do {} while(0)
-#define _STARPU_TRACE_COND_WAIT_END(file,line)			do {} while(0)
+#define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
+#define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
+#define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)
+#define _STARPU_TRACE_MUTEX_UNLOCKED()		do {} while(0)
+#define _STARPU_TRACE_TRYLOCK_MUTEX()			do {} while(0)
+#define _STARPU_TRACE_RDLOCKING_RWLOCK()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_RDLOCKED()		do {} while(0)
+#define _STARPU_TRACE_WRLOCKING_RWLOCK()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_WRLOCKED()		do {} while(0)
+#define _STARPU_TRACE_UNLOCKING_RWLOCK()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_UNLOCKED()		do {} while(0)
+#define _STARPU_TRACE_LOCKING_SPINLOCK()		do {} while(0)
+#define _STARPU_TRACE_SPINLOCK_LOCKED()		do {} while(0)
+#define _STARPU_TRACE_UNLOCKING_SPINLOCK()	do {} while(0)
+#define _STARPU_TRACE_SPINLOCK_UNLOCKED()		do {} while(0)
+#define _STARPU_TRACE_TRYLOCK_SPINLOCK()		do {} while(0)
+#define _STARPU_TRACE_COND_WAIT_BEGIN()		do {} while(0)
+#define _STARPU_TRACE_COND_WAIT_END()			do {} while(0)
 
 #endif // STARPU_FXT_LOCK_TRACES
 
@@ -544,24 +610,24 @@ do {										\
 #define _STARPU_TRACE_USER_EVENT(code)		do {} while(0)
 #define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0)
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0)
-#define _STARPU_TRACE_LOCKING_MUTEX(file,line)			do {} while(0)
-#define _STARPU_TRACE_MUTEX_LOCKED(file,line)			do {} while(0)
-#define _STARPU_TRACE_UNLOCKING_MUTEX(file,line)		do {} while(0)
-#define _STARPU_TRACE_MUTEX_UNLOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_TRYLOCK_MUTEX(file,line)			do {} while(0)
-#define _STARPU_TRACE_RDLOCKING_RWLOCK(file,line)		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_RDLOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_WRLOCKING_RWLOCK(file,line)		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_WRLOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_UNLOCKING_RWLOCK(file,line)		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_UNLOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_LOCKING_SPINLOCK(file,line)		do {} while(0)
-#define _STARPU_TRACE_SPINLOCK_LOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_UNLOCKING_SPINLOCK(file,line)	do {} while(0)
-#define _STARPU_TRACE_SPINLOCK_UNLOCKED(file,line)		do {} while(0)
-#define _STARPU_TRACE_TRYLOCK_SPINLOCK(file,line)		do {} while(0)
-#define _STARPU_TRACE_COND_WAIT_BEGIN(file,line)		do {} while(0)
-#define _STARPU_TRACE_COND_WAIT_END(file,line)			do {} while(0)
+#define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
+#define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
+#define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)
+#define _STARPU_TRACE_MUTEX_UNLOCKED()		do {} while(0)
+#define _STARPU_TRACE_TRYLOCK_MUTEX()			do {} while(0)
+#define _STARPU_TRACE_RDLOCKING_RWLOCK()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_RDLOCKED()		do {} while(0)
+#define _STARPU_TRACE_WRLOCKING_RWLOCK()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_WRLOCKED()		do {} while(0)
+#define _STARPU_TRACE_UNLOCKING_RWLOCK()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_UNLOCKED()		do {} while(0)
+#define _STARPU_TRACE_LOCKING_SPINLOCK()		do {} while(0)
+#define _STARPU_TRACE_SPINLOCK_LOCKED()		do {} while(0)
+#define _STARPU_TRACE_UNLOCKING_SPINLOCK()	do {} while(0)
+#define _STARPU_TRACE_SPINLOCK_UNLOCKED()		do {} while(0)
+#define _STARPU_TRACE_TRYLOCK_SPINLOCK()		do {} while(0)
+#define _STARPU_TRACE_COND_WAIT_BEGIN()		do {} while(0)
+#define _STARPU_TRACE_COND_WAIT_END()			do {} while(0)
 #define _STARPU_TRACE_MEMORY_FULL(size)				do {} while(0)
 
 #endif // STARPU_USE_FXT

+ 8 - 34
src/common/starpu_spinlock.h

@@ -54,55 +54,29 @@ int _starpu_spin_destroy(struct _starpu_spinlock *lock);
 
 int _starpu_spin_lock(struct _starpu_spinlock *lock);
 #define _starpu_spin_lock(lock) ({ \
-	const char *file;   \
-	if (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER) \
-	{ \
-		file = strrchr(__FILE__,'/'); \
-		file += sizeof(char);\
-		_STARPU_TRACE_LOCKING_SPINLOCK(file,__LINE__); \
-	}\
+	_STARPU_TRACE_LOCKING_SPINLOCK(); \
 	_starpu_spin_lock(lock); \
-	if (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER) \
-	{ \
-		file = strrchr(__FILE__,'/'); \
-		file += sizeof(char);\
-		_STARPU_TRACE_SPINLOCK_LOCKED(file,__LINE__); \
-	}\
+	_STARPU_TRACE_SPINLOCK_LOCKED(); \
 	STARPU_RECORD_LOCK(lock); \
 	0; \
 }) 
 
 int _starpu_spin_trylock(struct _starpu_spinlock *lock);
 #define _starpu_spin_trylock(lock) ({ \
-	const char *file;   \
-	if (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER) \
-	{ \
-		file = strrchr(__FILE__,'/'); \
-		file += sizeof(char);\
-		_STARPU_TRACE_TRYLOCK_SPINLOCK(file,__LINE__); \
-	}\
+	_STARPU_TRACE_TRYLOCK_SPINLOCK(); \
 	int err = _starpu_spin_trylock(lock); \
-	if (!err) \
+	if (!err) { \
 		STARPU_RECORD_LOCK(lock); \
+		_STARPU_TRACE_SPINLOCK_LOCKED(); \
+	} \
 	err; \
 })
 int _starpu_spin_checklocked(struct _starpu_spinlock *lock);
 int _starpu_spin_unlock(struct _starpu_spinlock *lock);
 #define _starpu_spin_unlock(lock) ({ \
-	const char *file;   \
-	if (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER) \
-	{ \
-		file = strrchr(__FILE__,'/'); \
-		file += sizeof(char);\
-		_STARPU_TRACE_UNLOCKING_SPINLOCK(file,__LINE__); \
-	}\
+	_STARPU_TRACE_UNLOCKING_SPINLOCK(); \
 	_starpu_spin_unlock(lock); \
-	if (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER) \
-	{ \
-		file = strrchr(__FILE__,'/'); \
-		file += sizeof(char);\
-		_STARPU_TRACE_SPINLOCK_UNLOCKED(file,__LINE__); \
-	}\
+	_STARPU_TRACE_SPINLOCK_UNLOCKED(); \
 	0; \
 }) 
 

+ 36 - 70
src/common/thread.c

@@ -84,43 +84,36 @@ int starpu_pthread_mutex_destroy(starpu_pthread_mutex_t *mutex)
 
 int starpu_pthread_mutex_lock(starpu_pthread_mutex_t *mutex)
 {
-	const char *file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-
-	_STARPU_TRACE_LOCKING_MUTEX(file,__LINE__);
+	_STARPU_TRACE_LOCKING_MUTEX();
 
 	if (!*mutex) STARPU_PTHREAD_MUTEX_INIT(mutex, NULL);
 
 	xbt_mutex_acquire(*mutex);
 
-	_STARPU_TRACE_MUTEX_LOCKED(file,__LINE__);
+	_STARPU_TRACE_MUTEX_LOCKED();
 
 	return 0;
 }
 
 int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
 {
-	const char *file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_UNLOCKING_MUTEX(file,__LINE__);
+	_STARPU_TRACE_UNLOCKING_MUTEX();
 
 	xbt_mutex_release(*mutex);
 
-	_STARPU_TRACE_MUTEX_UNLOCKED(file,__LINE__);
+	_STARPU_TRACE_MUTEX_UNLOCKED();
 
 	return 0;
 }
 
 int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
 {
-	const char *file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_TRYLOCK_MUTEX(file,__LINE__);
+	_STARPU_TRACE_TRYLOCK_MUTEX();
 
 	xbt_mutex_acquire(*mutex);
+
+	_STARPU_TRACE_MUTEX_LOCKED();
+
 	return 0;
 }
 
@@ -185,16 +178,13 @@ int starpu_pthread_cond_broadcast(starpu_pthread_cond_t *cond)
 
 int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
 {
-	const char* file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_COND_WAIT_BEGIN(file,__LINE__);
+	_STARPU_TRACE_COND_WAIT_BEGIN();
 
 	if (!*cond)
 		STARPU_PTHREAD_COND_INIT(cond, NULL);
 	xbt_cond_wait(*cond, *mutex);
 
-	_STARPU_TRACE_COND_WAIT_END(file,__LINE__);
+	_STARPU_TRACE_COND_WAIT_END();
 
 	return 0;
 }
@@ -218,42 +208,33 @@ int starpu_pthread_rwlock_destroy(starpu_pthread_rwlock_t *rwlock)
 
 int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
 {
-	const char* file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_RDLOCKING_RWLOCK(file,__LINE__);
+	_STARPU_TRACE_RDLOCKING_RWLOCK();
 
  	int p_ret = starpu_pthread_mutex_lock(rwlock);
 
-	_STARPU_TRACE_RWLOCK_RDLOCKED(file,__LINE__);
+	_STARPU_TRACE_RWLOCK_RDLOCKED();
 
 	return p_ret;
 }
 
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 {
-	const char* file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_WRLOCKING_RWLOCK(file,__LINE__);
+	_STARPU_TRACE_WRLOCKING_RWLOCK();
 
  	int p_ret = starpu_pthread_mutex_lock(rwlock);
 
-	_STARPU_TRACE_RWLOCK_WRLOCKED(file,__LINE__);
+	_STARPU_TRACE_RWLOCK_WRLOCKED();
 
 	return p_ret;
 }
 
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 {
-	const char* file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_UNLOCKING_RWLOCK(file,__LINE__);
+	_STARPU_TRACE_UNLOCKING_RWLOCK();
 
  	int p_ret = starpu_pthread_mutex_unlock(rwlock);
 
-	_STARPU_TRACE_RWLOCK_UNLOCKED(file,__LINE__);
+	_STARPU_TRACE_RWLOCK_UNLOCKED();
 
 	return p_ret;
 }
@@ -262,94 +243,79 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 
 int starpu_pthread_mutex_lock(starpu_pthread_mutex_t *mutex)
 {
-	const char *file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_LOCKING_MUTEX(file,__LINE__);
+	_STARPU_TRACE_LOCKING_MUTEX();
 
 	int p_ret = pthread_mutex_lock(mutex);
 
-	_STARPU_TRACE_MUTEX_LOCKED(file,__LINE__);
+	_STARPU_TRACE_MUTEX_LOCKED();
 
 	return p_ret;
 }
 
 int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
 {
-	const char *file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_UNLOCKING_MUTEX(file,__LINE__);
+	_STARPU_TRACE_UNLOCKING_MUTEX();
 
 	int p_ret = pthread_mutex_unlock(mutex);
 
-	_STARPU_TRACE_MUTEX_UNLOCKED(file,__LINE__);
+	_STARPU_TRACE_MUTEX_UNLOCKED();
 
 	return p_ret;
 }
 
 int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
 {
-	const char *file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_LOCKING_MUTEX(file,__LINE__);
+	int ret;
+	_STARPU_TRACE_TRYLOCK_MUTEX();
+
+	ret = pthread_mutex_trylock(mutex);
+
+	if (!ret)
+		_STARPU_TRACE_MUTEX_LOCKED();
 
-	return pthread_mutex_trylock(mutex);
+	return ret;
 }
 
 int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
 {
-	const char* file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_COND_WAIT_BEGIN(file,__LINE__);
+	_STARPU_TRACE_COND_WAIT_BEGIN();
 
  	int p_ret = pthread_cond_wait(cond, mutex);
 
-	_STARPU_TRACE_COND_WAIT_END(file,__LINE__);
+	_STARPU_TRACE_COND_WAIT_END();
 
 	return p_ret;
 }
 
 int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
 {
-	const char* file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_RDLOCKING_RWLOCK(file,__LINE__);
+	_STARPU_TRACE_RDLOCKING_RWLOCK();
 
  	int p_ret = pthread_rwlock_rdlock(rwlock);
 
-	_STARPU_TRACE_RWLOCK_RDLOCKED(file,__LINE__);
+	_STARPU_TRACE_RWLOCK_RDLOCKED();
 
 	return p_ret;
 }
 
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 {
-	const char* file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_WRLOCKING_RWLOCK(file,__LINE__);
+	_STARPU_TRACE_WRLOCKING_RWLOCK();
 
  	int p_ret = pthread_rwlock_wrlock(rwlock);
 
-	_STARPU_TRACE_RWLOCK_WRLOCKED(file,__LINE__);
+	_STARPU_TRACE_RWLOCK_WRLOCKED();
 
 	return p_ret;
 }
 
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 {
-	const char* file;
-	file = strrchr(__FILE__,'/');
-	file += sizeof(char);
-	_STARPU_TRACE_UNLOCKING_RWLOCK(file,__LINE__);
+	_STARPU_TRACE_UNLOCKING_RWLOCK();
 
  	int p_ret = pthread_rwlock_unlock(rwlock);
 
-	_STARPU_TRACE_RWLOCK_UNLOCKED(file,__LINE__);
+	_STARPU_TRACE_RWLOCK_UNLOCKED();
 
 	return p_ret;
 }

+ 22 - 29
src/common/utils.h

@@ -25,23 +25,14 @@
 #include <stdlib.h>
 #include <math.h>
 #include <pthread.h>
+#ifdef STARPU_HAVE_SCHED_YIELD
+#include <sched.h>
+#endif
 
 #ifdef STARPU_HAVE_HELGRIND_H
 #include <valgrind/helgrind.h>
 #endif
 
-#ifndef VALGRIND_HG_MUTEX_LOCK_PRE
-#define VALGRIND_HG_MUTEX_LOCK_PRE(mutex, istrylock) ((void)0)
-#endif
-#ifndef VALGRIND_HG_MUTEX_LOCK_POST
-#define VALGRIND_HG_MUTEX_LOCK_POST(mutex) ((void)0)
-#endif
-#ifndef VALGRIND_HG_MUTEX_UNLOCK_PRE
-#define VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex) ((void)0)
-#endif
-#ifndef VALGRIND_HG_MUTEX_UNLOCK_POST
-#define VALGRIND_HG_MUTEX_UNLOCK_POST(mutex) ((void)0)
-#endif
 #ifndef DO_CREQ_v_WW
 #define DO_CREQ_v_WW(_creqF, _ty1F, _arg1F, _ty2F, _arg2F) ((void)0)
 #endif
@@ -54,25 +45,14 @@
 #ifndef ANNOTATE_HAPPENS_AFTER
 #define ANNOTATE_HAPPENS_AFTER(obj) ((void)0)
 #endif
-#ifndef ANNOTATE_RWLOCK_ACQUIRED
-#define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) ((void)0)
+#ifndef VALGRIND_HG_DISABLE_CHECKING
+#define VALGRIND_HG_DISABLE_CHECKING(start, len) ((void)0)
 #endif
-#ifndef ANNOTATE_RWLOCK_RELEASED
-#define ANNOTATE_RWLOCK_RELEASED(lock, is_w) ((void)0)
+#ifndef VALGRIND_HG_ENABLE_CHECKING
+#define VALGRIND_HG_ENABLE_CHECKING(start, len) ((void)0)
 #endif
-
-#define _STARPU_VALGRIND_HG_SPIN_LOCK_PRE(lock) \
-	DO_CREQ_v_WW(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_PRE, \
-			struct _starpu_spinlock *, lock, long, 0)
-#define _STARPU_VALGRIND_HG_SPIN_LOCK_POST(lock) \
-	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_POST, \
-			struct _starpu_spinlock *, lock)
-#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(lock) \
-	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_PRE, \
-			struct _starpu_spinlock *, lock)
-#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(lock) \
-	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_POST, \
-			struct _starpu_spinlock *, lock)
+#define STARPU_HG_DISABLE_CHECKING(variable) VALGRIND_HG_DISABLE_CHECKING(&(variable), sizeof(variable))
+#define STARPU_HG_ENABLE_CHECKING(variable)  VALGRIND_HG_ENABLE_CHECKING(&(variable), sizeof(variable))
 
 #if defined(__KNC__) || defined(__KNF__)
 #define STARPU_DEBUG_PREFIX "[starpu-mic]"
@@ -80,6 +60,19 @@
 #define STARPU_DEBUG_PREFIX "[starpu]"
 #endif
 
+/* This is needed in some places to make valgrind yield to another thread to be
+ * able to progress.  */
+#if defined(__i386__) || defined(__x86_64__)
+#define _STARPU_UYIELD() __asm__ __volatile("rep; nop")
+#else
+#define _STARPU_UYIELD() ((void)0)
+#endif
+#if defined(STARPU_HAVE_SCHED_YIELD) && defined(STARPU_HAVE_HELGRIND_H)
+#define STARPU_UYIELD() do { if (RUNNING_ON_VALGRIND) sched_yield(); else _STARPU_UYIELD(); } while (0)
+#else
+#define STARPU_UYIELD() _STARPU_UYIELD()
+#endif
+
 #ifdef STARPU_VERBOSE
 #  define _STARPU_DEBUG(fmt, ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%s] " fmt ,__starpu_func__ ,## __VA_ARGS__); fflush(stderr); }} while(0)
 #else

+ 8 - 1
src/core/dependencies/implicit_data_deps.c

@@ -481,8 +481,14 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	unsigned do_submit_tasks = 0;
 
-	if (handle->post_sync_tasks_cnt > 0)
+	/* Here helgrind would shout that this is an unprotected access, but
+	 * count can only be zero if we don't have to care about
+	 * post_sync_tasks_cnt at all.  */
+	unsigned count = handle->post_sync_tasks_cnt;
+
+	if (count)
 	{
+		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 		if (--handle->post_sync_tasks_cnt == 0)
 		{
 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
@@ -490,6 +496,7 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 			post_sync_tasks = handle->post_sync_tasks;
 			handle->post_sync_tasks = NULL;
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 	}
 
 	if (do_submit_tasks)

+ 1 - 1
src/core/dependencies/task_deps.c

@@ -70,9 +70,9 @@ void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, s
 		STARPU_ASSERT_MSG(!job->submitted || !task->destroy || task->detach, "Task dependencies have to be set before submission (submitted %u destroy %d detach %d)", job->submitted, task->destroy, task->detach);
 	else
 		STARPU_ASSERT_MSG(job->terminated <= 1, "Task dependencies have to be set before termination (terminated %u)", job->terminated);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&job->sync_mutex);
 
 	struct _starpu_cg *cg = create_cg_task(ndeps, job);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&job->sync_mutex);
 
 	unsigned i;
 	for (i = 0; i < ndeps; i++)

+ 5 - 4
src/core/disk_ops/disk_stdio.c

@@ -55,10 +55,9 @@ starpu_stdio_alloc (void *base, size_t size)
 	int id = -1;
 
 	/* create template for mkstemp */
-	char * baseCpy = malloc(strlen(base)+8);
-	STARPU_ASSERT(baseCpy != NULL);
-
 	char * tmp = "STARPU_XXXXXX";
+	char * baseCpy = malloc(strlen(base)+1+strlen(tmp)+1);
+	STARPU_ASSERT(baseCpy != NULL);
 
 	strcpy(baseCpy, (char *) base);
 	strcat(baseCpy,"/");
@@ -294,7 +293,7 @@ get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 	struct timeval end;
 	
 	srand (time (NULL)); 
-	char * buf = malloc(SIZE_DISK_MIN*sizeof(char));
+	char * buf = malloc(SIZE_DISK_MIN);
 	STARPU_ASSERT(buf != NULL);
 	
 	/* allocate memory */
@@ -304,6 +303,8 @@ get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 		return 0;
 	struct starpu_stdio_obj * tmp = (struct starpu_stdio_obj *) mem;
 
+	memset(buf, 0, SIZE_DISK_MIN);
+
 	/* Measure upload slowness */
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; ++iter)

+ 2 - 1
src/core/perfmodel/perfmodel.c

@@ -262,7 +262,8 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 		_starpu_spin_lock(&handle->header_lock);
 		handle->refcnt--;
 		handle->busy_count--;
-		_starpu_spin_unlock(&handle->header_lock);
+		if (!_starpu_data_check_not_busy(handle))
+			_starpu_spin_unlock(&handle->header_lock);
 		starpu_task_clean(conversion_task);
 		free(conversion_task);
 	}

+ 1 - 0
src/core/perfmodel/perfmodel_bus.c

@@ -717,6 +717,7 @@ static void benchmark_all_gpu_devices(void)
 
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
+	hwloc_bitmap_free(former_cpuset);
 #elif __linux__
 	/* Restore the former affinity */
 	ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);

+ 27 - 16
src/core/perfmodel/perfmodel_history.c

@@ -255,6 +255,12 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_perfmodel_per_arch
 		{
 			entry = (struct starpu_perfmodel_history_entry *) malloc(sizeof(struct starpu_perfmodel_history_entry));
 			STARPU_ASSERT(entry);
+
+			/* Tell  helgrind that we do not care about
+			 * racing access to the sampling, we only want a
+			 * good-enough estimation */
+			STARPU_HG_DISABLE_CHECKING(entry->nsample);
+			STARPU_HG_DISABLE_CHECKING(entry->mean);
 		}
 
 		scan_history_entry(f, entry);
@@ -1210,13 +1216,15 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 		HASH_FIND_UINT32_T(history, &key, entry);
 		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
-		/* We do not care about racing access to the mean, we only want a
-		 * good-enough estimation, thus simulate taking the rdlock */
-		ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
+		/* Here helgrind would shout that this is unprotected access.
+		 * We do not care about racing access to the mean, we only want
+		 * a good-enough estimation */
 
 		if (entry && entry->history_entry && entry->history_entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
 			exp = entry->history_entry->mean;
-		else if (!model->benchmarking)
+
+		STARPU_HG_DISABLE_CHECKING(model->benchmarking);
+		if (isnan(exp) && !model->benchmarking)
 		{
 			char archname[32];
 
@@ -1225,7 +1233,6 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 		}
-		ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
 	}
 
 	return exp;
@@ -1233,7 +1240,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
 double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j,unsigned nimpl)
 {
-	double exp;
+	double exp = NAN;
 	struct starpu_perfmodel_per_arch *per_arch_model;
 	struct starpu_perfmodel_history_entry *entry;
 	struct starpu_perfmodel_history_table *history, *elt;
@@ -1248,18 +1255,17 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 	entry = (elt == NULL) ? NULL : elt->history_entry;
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
 
-	/* We do not care about racing access to the mean, we only want a
-	 * good-enough estimation, thus simulate taking the rdlock */
-	ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
-
-	exp = entry?entry->mean:NAN;
+	/* Here helgrind would shout that this is unprotected access.
+	 * We do not care about racing access to the mean, we only want
+	 * a good-enough estimation */
 
-	if (entry && entry->nsample < _STARPU_CALIBRATION_MINIMUM)
+	if (entry && entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
 		/* TODO: report differently if we've scheduled really enough
 		 * of that task and the scheduler should perhaps put it aside */
-		/* Not calibrated enough */
-		exp = NAN;
+		/* Calibrated enough */
+		exp = entry->mean;
 
+	STARPU_HG_DISABLE_CHECKING(model->benchmarking);
 	if (isnan(exp) && !model->benchmarking)
 	{
 		char archname[32];
@@ -1270,8 +1276,6 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 		model->benchmarking = 1;
 	}
 
-	ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
-
 	return exp;
 }
 
@@ -1310,6 +1314,13 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				/* this is the first entry with such a footprint */
 				entry = (struct starpu_perfmodel_history_entry *) malloc(sizeof(struct starpu_perfmodel_history_entry));
 				STARPU_ASSERT(entry);
+
+				/* Tell  helgrind that we do not care about
+				 * racing access to the sampling, we only want a
+				 * good-enough estimation */
+				STARPU_HG_DISABLE_CHECKING(entry->nsample);
+				STARPU_HG_DISABLE_CHECKING(entry->mean);
+
 				entry->mean = measured;
 				entry->sum = measured;
 

+ 35 - 11
src/core/sched_policy.c

@@ -103,7 +103,8 @@ static struct starpu_sched_policy *find_sched_policy_from_name(const char *polic
 			}
 		}
 	}
-	fprintf(stderr, "Warning: scheduling policy \"%s\" was not found, try \"help\" to get a list\n", policy_name);
+	if (strcmp(policy_name, "help") != 0)
+	     fprintf(stderr, "Warning: scheduling policy \"%s\" was not found, try \"help\" to get a list\n", policy_name);
 
 	/* nothing was found */
 	return NULL;
@@ -117,12 +118,13 @@ static void display_sched_help_message(void)
 		/* display the description of all predefined policies */
 		struct starpu_sched_policy **policy;
 
-		fprintf(stderr, "STARPU_SCHED can be either of\n");
+		fprintf(stderr, "\nThe variable STARPU_SCHED can be set to one of the following strings:\n");
 		for(policy=predefined_policies ; *policy!=NULL ; policy++)
 		{
 			struct starpu_sched_policy *p = *policy;
 			fprintf(stderr, "%s\t-> %s\n", p->policy_name, p->policy_description);
 		}
+		fprintf(stderr, "\n");
 	 }
 }
 
@@ -630,18 +632,29 @@ pick:
 				}
 			}
 
-			if(!task && sched_ctx && worker->removed_from_ctx[sched_ctx->id])
+			if(!task)
 			{
-				_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
-				worker->removed_from_ctx[sched_ctx->id] = 0;
-			}
+				if(sched_ctx && worker->removed_from_ctx[sched_ctx->id])
+				{
+					_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
+					worker->removed_from_ctx[sched_ctx->id] = 0;
+				} 
+#ifdef STARPU_USE_SC_HYPERVISOR
+				else 
+				{
+					struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
+					if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_cycle)
+						perf_counters->notify_idle_cycle(sched_ctx->id, worker->workerid, 1.0);
+				}
+#endif //STARPU_USE_SC_HYPERVISOR
+					
 #ifndef STARPU_NON_BLOCKING_DRIVERS
-			if((!task && sched_ctx->pop_counter[worker->workerid] == 0 && been_here[sched_ctx->id]) || worker->nsched_ctxs == 1)
-				break;
-
-
-			been_here[sched_ctx->id] = 1;
+				if((sched_ctx->pop_counter[worker->workerid] == 0 && been_here[sched_ctx->id]) || worker->nsched_ctxs == 1)
+					break;
+				been_here[sched_ctx->id] = 1;
 #endif
+			}
+			
 			sched_ctx->pop_counter[worker->workerid]++;
 		}
 	  }
@@ -650,6 +663,17 @@ pick:
 	if (!task)
 		return NULL;
 
+
+
+#ifdef STARPU_USE_SC_HYPERVISOR
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
+	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
+
+	if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_end)
+		perf_counters->notify_idle_end(task->sched_ctx, worker->workerid);
+#endif //STARPU_USE_SC_HYPERVISOR
+
+
 	/* Make sure we do not bother with all the multiformat-specific code if
 	 * it is not necessary. */
 	if (!_starpu_task_uses_multiformat_handles(task))

+ 8 - 0
src/core/task.c

@@ -160,6 +160,14 @@ void _starpu_task_destroy(struct starpu_task *task)
 		if (task->cl_arg_free)
 			free(task->cl_arg);
 
+		/* Does user want StarPU release callback_arg ? */
+		if (task->callback_arg_free)
+			free(task->callback_arg);
+
+		/* Does user want StarPU release prologue_callback_arg ? */
+		if (task->prologue_callback_arg_free)
+			free(task->prologue_callback_arg);
+
 		free(task);
 	}
 }

+ 69 - 47
src/core/workers.c

@@ -383,7 +383,68 @@ static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
 struct itimerval prof_itimer;
 #endif
 
-void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key)
+static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu_machine_config *pconfig)
+{
+	workerarg->config = pconfig;
+	STARPU_PTHREAD_MUTEX_INIT(&workerarg->mutex, NULL);
+	/* arch initialized by topology.c */
+	/* worker_mask initialized by topology.c */
+	/* perf_arch initialized by topology.c */
+	/* worker_thread initialized by _starpu_launch_drivers */
+	/* mp_nodeid initialized by topology.c */
+	/* devid initialized by topology.c */
+	/* bindid initialized by topology.c */
+	/* workerid initialized by topology.c */
+	workerarg->combined_workerid = workerarg->workerid;
+	workerarg->current_rank = 0;
+	workerarg->worker_size = 1;
+	STARPU_PTHREAD_COND_INIT(&workerarg->started_cond, NULL);
+	STARPU_PTHREAD_COND_INIT(&workerarg->ready_cond, NULL);
+	/* memory_node initialized by topology.c */
+	STARPU_PTHREAD_COND_INIT(&workerarg->sched_cond, NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
+	starpu_task_list_init(&workerarg->local_tasks);
+	workerarg->current_task = NULL;
+	workerarg->set = NULL;
+
+	/* if some codelet's termination cannot be handled directly :
+	 * for instance in the Gordon driver, Gordon tasks' callbacks
+	 * may be executed by another thread than that of the Gordon
+	 * driver so that we cannot call the push_codelet_output method
+	 * directly */
+	workerarg->terminated_jobs = _starpu_job_list_new();
+
+	workerarg->worker_is_running = 0;
+	workerarg->worker_is_initialized = 0;
+	workerarg->status = STATUS_INITIALIZING;
+	/* name initialized by driver */
+	/* short_name initialized by driver */
+	workerarg->run_by_starpu = 1;
+
+	workerarg->sched_ctx_list = NULL;
+	workerarg->nsched_ctxs = 0;
+	_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
+
+	workerarg->has_prev_init = 0;
+
+	int ctx;
+	for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
+		workerarg->removed_from_ctx[ctx] = 0;
+
+	workerarg->spinning_backoff = 1;
+
+	STARPU_PTHREAD_COND_INIT(&workerarg->parallel_sect_cond, NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&workerarg->parallel_sect_mutex, NULL);
+
+	workerarg->parallel_sect = 0;
+
+	for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
+		workerarg->shares_tasks_lists[ctx] = 0;
+
+	/* cpu_set/hwloc_cpu_set initialized in topology.c */
+}
+
+void _starpu_worker_start(struct _starpu_worker *worker, unsigned fut_key)
 {
 	(void) fut_key;
 	int devid = worker->devid;
@@ -415,7 +476,6 @@ void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key)
 	worker->worker_is_running = 1;
 	STARPU_PTHREAD_COND_SIGNAL(&worker->started_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
-	worker->spinning_backoff = 1;
 }
 
 static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
@@ -446,51 +506,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 		unsigned mp_nodeid = workerarg->mp_nodeid;
 #endif
 
-		workerarg->config = pconfig;
-
-		_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
-
-		STARPU_PTHREAD_MUTEX_INIT(&workerarg->mutex, NULL);
-		STARPU_PTHREAD_COND_INIT(&workerarg->started_cond, NULL);
-		STARPU_PTHREAD_COND_INIT(&workerarg->ready_cond, NULL);
-
-		workerarg->worker_size = 1;
-		workerarg->combined_workerid = workerarg->workerid;
-		workerarg->current_rank = 0;
-		workerarg->has_prev_init = 0;
-		/* mutex + cond only for the local list */
-		/* we have a single local list */
-		/* afterwards there would be a mutex + cond for the list of each strategy */
-		workerarg->run_by_starpu = 1;
-		workerarg->worker_is_running = 0;
-		workerarg->worker_is_initialized = 0;
-		workerarg->set = NULL;
-
-		int ctx;
-		for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
-		{
-			workerarg->removed_from_ctx[ctx] = 0;
-			workerarg->shares_tasks_lists[ctx] = 0;
-		}
-
-
-		STARPU_PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
-		STARPU_PTHREAD_COND_INIT(&workerarg->sched_cond, NULL);
-		STARPU_PTHREAD_MUTEX_INIT(&workerarg->parallel_sect_mutex, NULL);
-		STARPU_PTHREAD_COND_INIT(&workerarg->parallel_sect_cond, NULL);
-		workerarg->parallel_sect = 0;
-
-		/* if some codelet's termination cannot be handled directly :
-		 * for instance in the Gordon driver, Gordon tasks' callbacks
-		 * may be executed by another thread than that of the Gordon
-		 * driver so that we cannot call the push_codelet_output method
-		 * directly */
-		workerarg->terminated_jobs = _starpu_job_list_new();
-
-		starpu_task_list_init(&workerarg->local_tasks);
-
-		workerarg->status = STATUS_INITIALIZING;
-
 		_STARPU_DEBUG("initialising worker %u/%u\n", worker, nworkers);
 
 		_starpu_init_worker_queue(workerarg);
@@ -857,6 +872,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 {
 	int is_a_sink = 0; /* Always defined. If the MP infrastructure is not
 			    * used, we cannot be a sink. */
+	unsigned worker;
 #ifdef STARPU_USE_MP
 	_starpu_set_argc_argv(argc, argv);
 
@@ -884,6 +900,9 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	_STARPU_DISP("Warning: StarPU was configured with --enable-debug (-O0), and is thus not optimized\n");
 #endif
 #endif
+#ifdef STARPU_SPINLOCK_CHECK
+	_STARPU_DISP("Warning: StarPU was configured with --enable-spinlock-check, which slows down a bit\n");
+#endif
 #if 0
 #ifndef STARPU_NO_ASSERT
 	_STARPU_DISP("Warning: StarPU was configured without --enable-fast\n");
@@ -1005,6 +1024,9 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	 * threads */
 	_starpu_initialize_current_task_key();
 
+	for (worker = 0; worker < config.topology.nworkers; worker++)
+		_starpu_worker_init(&config.workers[worker], &config);
+
 	if (!is_a_sink)
 	{
 		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&config, config.conf->sched_policy_name);

+ 2 - 1
src/core/workers.h

@@ -52,6 +52,7 @@
 
 #include <starpu_parameters.h>
 
+/* This is initialized from in _starpu_worker_init */
 struct _starpu_worker
 {
 	struct _starpu_machine_config *config;
@@ -341,7 +342,7 @@ void _starpu_block_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthr
 void _starpu_set_local_worker_key(struct _starpu_worker *worker);
 
 /* This function initializes the current thread for the given worker */
-void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key);
+void _starpu_worker_start(struct _starpu_worker *worker, unsigned fut_key);
 
 /* Returns the _starpu_worker structure that describes the state of the
  * current worker. */

+ 1 - 0
src/datawizard/coherency.h

@@ -178,6 +178,7 @@ struct _starpu_data_state
 	unsigned long last_submitted_ghost_sync_id;
 	struct _starpu_jobid_list *last_submitted_ghost_accessors_id;
 
+	/* protected by sequential_consistency_mutex */
 	struct _starpu_task_wrapper_list *post_sync_tasks;
 	unsigned post_sync_tasks_cnt;
 

+ 52 - 16
src/datawizard/data_request.c

@@ -20,6 +20,13 @@
 #include <common/utils.h>
 #include <datawizard/datawizard.h>
 
+/* TODO: This should be tuned according to driver capabilities
+ * Data interfaces should also have to declare how many asynchronous requests
+ * they have actually started (think of e.g. csr).
+ */
+#define MAX_PENDING_REQUESTS_PER_NODE 400
+#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 200
+
 /* requests that have not been treated at all */
 static struct _starpu_data_request_list *data_requests[STARPU_MAXNODES];
 static struct _starpu_data_request_list *prefetch_requests[STARPU_MAXNODES];
@@ -27,6 +34,7 @@ static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 
 /* requests that are not terminated (eg. async transfers) */
 static struct _starpu_data_request_list *data_requests_pending[STARPU_MAXNODES];
+static unsigned data_requests_npending[STARPU_MAXNODES];
 static starpu_pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES];
 
 void _starpu_init_data_request_lists(void)
@@ -36,11 +44,19 @@ void _starpu_init_data_request_lists(void)
 	{
 		prefetch_requests[i] = _starpu_data_request_list_new();
 		data_requests[i] = _starpu_data_request_list_new();
+
+		/* Tell helgrind that we are fine with checking for list_empty
+		 * in _starpu_handle_node_data_requests, we will call it
+		 * periodically anyway */
+		STARPU_HG_DISABLE_CHECKING(data_requests[i]->_head);
+
 		STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i], NULL);
 
 		data_requests_pending[i] = _starpu_data_request_list_new();
+		data_requests_npending[i] = 0;
 		STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i], NULL);
 	}
+	STARPU_HG_DISABLE_CHECKING(data_requests_npending);
 }
 
 void _starpu_deinit_data_request_lists(void)
@@ -380,6 +396,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
 		_starpu_data_request_list_push_front(data_requests_pending[r->handling_node], r);
+		data_requests_npending[r->handling_node]++;
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
 
 		return -EAGAIN;
@@ -397,18 +414,11 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 
-	/* Note: we here tell valgrind that list_empty (reading a pointer) is
-	 * as safe as if we had the lock held */
-	VALGRIND_HG_MUTEX_LOCK_PRE(&data_requests_list_mutex[src_node], 0);
-	VALGRIND_HG_MUTEX_LOCK_POST(&data_requests_list_mutex[src_node]);
+	/* Here helgrind would should that this is an un protected access.
+	 * We however don't care about missing an entry, we will get called
+	 * again sooner or later. */
 	if (_starpu_data_request_list_empty(data_requests[src_node]))
-	{
-		VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
-		VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
 		return;
-	}
-	VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
-	VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
 
 	/* take all the entries from the request list */
         STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
@@ -427,6 +437,7 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 	 * requests, and we handle the request(s) one by one in the former
 	 * list, without concurrency issues.*/
 	data_requests[src_node] = _starpu_data_request_list_new();
+	STARPU_HG_DISABLE_CHECKING(data_requests[src_node]->_head);
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
 
@@ -437,15 +448,29 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 	{
                 int res;
 
+		if (data_requests_npending[src_node] >= MAX_PENDING_REQUESTS_PER_NODE)
+		{
+			/* Too many requests at the same time, skip pushing
+			 * more for now */
+			break;
+		}
+
 		r = _starpu_data_request_list_pop_front(local_list);
 
 		res = starpu_handle_data_request(r, may_alloc);
 		if (res == -ENOMEM)
 		{
 			_starpu_data_request_list_push_back(new_data_requests, r);
+			break;
 		}
 	}
 
+	while (!_starpu_data_request_list_empty(local_list))
+	{
+		r = _starpu_data_request_list_pop_front(local_list);
+		_starpu_data_request_list_push_back(new_data_requests, r);
+	}
+
 	if (!_starpu_data_request_list_empty(new_data_requests))
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
@@ -493,6 +518,13 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 	{
                 int res;
 
+		if (data_requests_npending[src_node] >= MAX_PENDING_PREFETCH_REQUESTS_PER_NODE)
+		{
+			/* Too many requests at the same time, skip pushing
+			 * more for now */
+			break;
+		}
+
 		r = _starpu_data_request_list_pop_front(local_list);
 
 		res = starpu_handle_data_request(r, may_alloc);
@@ -539,6 +571,7 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
 //
 	struct _starpu_data_request_list *new_data_requests_pending;
+	unsigned taken, kept;
 
 	if (_starpu_data_request_list_empty(data_requests_pending[src_node]))
 		return;
@@ -558,11 +591,14 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
 
 	new_data_requests_pending = _starpu_data_request_list_new();
+	taken = 0;
+	kept = 0;
 
 	while (!_starpu_data_request_list_empty(local_list))
 	{
 		struct _starpu_data_request *r;
 		r = _starpu_data_request_list_pop_front(local_list);
+		taken++;
 
 		starpu_data_handle_t handle = r->handle;
 
@@ -592,15 +628,15 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 				_starpu_spin_unlock(&handle->header_lock);
 
 				_starpu_data_request_list_push_back(new_data_requests_pending, r);
+				kept++;
 			}
 		}
 	}
-	if (!_starpu_data_request_list_empty(new_data_requests_pending))
-	{
-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+	data_requests_npending[src_node] -= taken - kept;
+	if (kept)
 		_starpu_data_request_list_push_list_back(data_requests_pending[src_node], new_data_requests_pending);
-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
-	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
 
 	_starpu_data_request_list_delete(local_list);
 	_starpu_data_request_list_delete(new_data_requests_pending);
@@ -626,7 +662,7 @@ int _starpu_check_that_no_data_request_exists(unsigned node)
 	no_request = _starpu_data_request_list_empty(data_requests[node]);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[node]);
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[node]);
-	no_pending = _starpu_data_request_list_empty(data_requests_pending[node]);
+	no_pending = !data_requests_npending[node];
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[node]);
 
 	return (no_request && no_pending);

+ 23 - 1
src/datawizard/filters.c

@@ -191,6 +191,8 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		child->last_sync_task = NULL;
 		child->last_submitted_accessors = NULL;
 		child->post_sync_tasks = NULL;
+		/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
+		STARPU_HG_DISABLE_CHECKING(child->post_sync_tasks_cnt);
 		child->post_sync_tasks_cnt = 0;
 
 		/* The methods used for reduction are propagated to the
@@ -274,6 +276,8 @@ void _starpu_empty_codelet_function(void *buffers[], void *args)
 void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gathering_node)
 {
 	unsigned child;
+	unsigned worker;
+	unsigned nworkers = starpu_worker_get_count();
 	unsigned node;
 	unsigned sizes[root_handle->nchildren];
 
@@ -315,16 +319,34 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 		}
 
 		int ret;
-		ret = _starpu_fetch_data_on_node(child_handle, &child_handle->per_node[gathering_node], STARPU_R, 0, 0, NULL, NULL);
 		/* for now we pretend that the RAM is almost unlimited and that gathering
 		 * data should be possible from the node that does the unpartionning ... we
 		 * don't want to have the programming deal with memory shortage at that time,
 		 * really */
+		if (child_handle->current_mode == STARPU_REDUX)
+		{
+			/* Acquire the child data on the gathering node. This will trigger collapsing the reduction */
+			ret = starpu_data_acquire_on_node(child_handle, gathering_node, STARPU_RW);
+			_starpu_unlock_post_sync_tasks(child_handle);
+		} else
+		{
+			/* Simply transfer any pending data */
+			ret = _starpu_fetch_data_on_node(child_handle, &child_handle->per_node[gathering_node], STARPU_R, 0, 0, NULL, NULL);
+		}
 		STARPU_ASSERT(ret == 0);
 
 		_starpu_spin_lock(&child_handle->header_lock);
 
 		_starpu_data_free_interfaces(child_handle);
+
+		for (worker = 0; worker < nworkers; worker++)
+		{
+			struct _starpu_data_replicate *local = &child_handle->per_worker[worker];
+			STARPU_ASSERT(local->state == STARPU_INVALID);
+			if (local->allocated && local->automatically_allocated)
+				_starpu_request_mem_chunk_removal(child_handle, local, starpu_worker_get_memory_node(worker), sizes[child]);
+		}
+
 		_starpu_memory_stats_free(child_handle);
 		_starpu_data_requester_list_delete(child_handle->req_list);
 		_starpu_data_requester_list_delete(child_handle->reduction_req_list);

+ 14 - 9
src/datawizard/interfaces/data_interface.c

@@ -200,6 +200,9 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	handle->last_sync_task = NULL;
 	handle->last_submitted_accessors = NULL;
 	handle->post_sync_tasks = NULL;
+
+	/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
+	STARPU_HG_DISABLE_CHECKING(handle->post_sync_tasks_cnt);
 	handle->post_sync_tasks_cnt = 0;
 
 	/* By default, there are no methods available to perform a reduction */
@@ -292,6 +295,10 @@ int _starpu_data_handle_init(starpu_data_handle_t handle, struct starpu_data_int
 	unsigned node;
 	unsigned worker;
 
+	/* Tell helgrind that our access to busy_count in
+	 * starpu_data_unregister is actually safe */
+	STARPU_HG_DISABLE_CHECKING(handle->busy_count);
+
 	handle->ops = interface_ops;
 	handle->mf_node = mf_node;
 
@@ -421,7 +428,8 @@ int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 	entry = (struct handle_tag_entry *) malloc(sizeof(*entry));
 	STARPU_ASSERT(entry != NULL);
 
-	STARPU_ASSERT_MSG(!(starpu_data_get_data_handle_from_tag(tag)),"data handle %p already has tag %d\n", starpu_data_get_data_handle_from_tag(tag), tag);
+	STARPU_ASSERT_MSG(!(starpu_data_get_data_handle_from_tag(tag)),
+			  "There is already a data handle %p registered with the tag %d\n", starpu_data_get_data_handle_from_tag(tag), tag);
 
 	entry->tag = tag;
 	entry->handle = handle;
@@ -677,14 +685,11 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
 	while (1) {
 		int busy;
-		/* Note: we here tell valgrind that reading busy_count is as
-		 * safe is if we had the lock held */
-		_STARPU_VALGRIND_HG_SPIN_LOCK_PRE(&handle->header_lock);
-		_STARPU_VALGRIND_HG_SPIN_LOCK_POST(&handle->header_lock);
-		busy = handle->busy_count;
-		_STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(&handle->header_lock);
-		_STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(&handle->header_lock);
-		if (!busy)
+		/* Here helgrind would shout that this an unprotected access,
+		 * but this is actually fine: all threads who do busy_count--
+		 * are supposed to call _starpu_data_check_not_busy, which will
+		 * wake us up through the busy_mutex/busy_cond. */
+		if (!handle->busy_count)
 			break;
 		/* This is woken by _starpu_data_check_not_busy, always called
 		 * after decrementing busy_count */

+ 1 - 5
src/datawizard/user_interactions.c

@@ -230,11 +230,7 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
         _STARPU_LOG_IN();
 
 	/* unless asynchronous, it is forbidden to call this function from a callback or a codelet */
-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
-	{
-                _STARPU_LOG_OUT_TAG("EDEADLK");
-		return -EDEADLK;
-        }
+	STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "Acquiring a data synchronously is not possible from a codelet or from a task callback, use starpu_data_acquire_cb instead.");
 
 	if (_starpu_data_is_multiformat_handle(handle) &&
 	    _starpu_handle_needs_conversion_task(handle, 0))

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -238,7 +238,7 @@ int _starpu_cpu_driver_init(struct starpu_driver *d)
 
 	int devid = cpu_worker->devid;
 
-	_starpu_worker_init(cpu_worker, _STARPU_FUT_CPU_KEY);
+	_starpu_worker_start(cpu_worker, _STARPU_FUT_CPU_KEY);
 	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
 	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->memory_node, cpu_worker->config));
 

+ 1 - 1
src/drivers/cuda/driver_cuda.c

@@ -391,7 +391,7 @@ int _starpu_cuda_driver_init(struct starpu_driver *d)
 	STARPU_ASSERT(args);
 	unsigned devid = args->devid;
 
-	_starpu_worker_init(args, _STARPU_FUT_CUDA_KEY);
+	_starpu_worker_start(args, _STARPU_FUT_CUDA_KEY);
 
 #ifndef STARPU_SIMGRID
 	init_context(devid);

+ 0 - 27
src/drivers/driver_common/driver_common.c

@@ -241,38 +241,11 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 			}
 		}
 
-#ifdef STARPU_USE_SC_HYPERVISOR
-		struct _starpu_sched_ctx *sched_ctx = NULL;
-		struct starpu_sched_ctx_performance_counters *perf_counters = NULL;
-		struct _starpu_sched_ctx_list *l = NULL;
-		for (l = args->sched_ctx_list; l; l = l->next)
-		{
-			sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
-			if(sched_ctx->id != 0)
-			{
-				perf_counters = sched_ctx->perf_counters;
-				if(perf_counters != NULL && perf_counters->notify_idle_cycle)
-				{
-					perf_counters->notify_idle_cycle(sched_ctx->id, args->workerid, 1.0);
-					
-				}
-			}
-		}
-#endif //STARPU_USE_SC_HYPERVISOR
-
 		return NULL;
 	}
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 
-#ifdef STARPU_USE_SC_HYPERVISOR
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
-	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
-
-	if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_end)
-		perf_counters->notify_idle_end(task->sched_ctx, args->workerid);
-#endif //STARPU_USE_SC_HYPERVISOR
-
 	_starpu_worker_set_status_wakeup(workerid);
 	args->spinning_backoff = BACKOFF_MIN;
 

+ 1 - 1
src/drivers/mic/driver_mic_source.c

@@ -517,7 +517,7 @@ void *_starpu_mic_src_worker(void *arg)
 
 	/* unsigned memnode = baseworker->memory_node; */
 
-	_starpu_worker_init(baseworker, _STARPU_FUT_MIC_KEY);
+	_starpu_worker_start(baseworker, _STARPU_FUT_MIC_KEY);
 
 	// Current task for a thread managing a worker set has no sense.
 	_starpu_set_current_task(NULL);

+ 1 - 1
src/drivers/opencl/driver_opencl.c

@@ -599,7 +599,7 @@ int _starpu_opencl_driver_init(struct starpu_driver *d)
 	STARPU_ASSERT(args);
 	int devid = args->devid;
 
-	_starpu_worker_init(args, _STARPU_FUT_OPENCL_KEY);
+	_starpu_worker_start(args, _STARPU_FUT_OPENCL_KEY);
 
 #ifndef STARPU_SIMGRID
 	_starpu_opencl_init_context(devid);

+ 1 - 1
src/drivers/scc/driver_scc_source.c

@@ -291,7 +291,7 @@ void *_starpu_scc_src_worker(void *arg)
 	unsigned mp_nodeid = args->mp_nodeid;
 	unsigned i;
 
-	_starpu_worker_init(args, _STARPU_FUT_SCC_KEY);
+	_starpu_worker_start(args, _STARPU_FUT_SCC_KEY);
 
 	_starpu_scc_src_init_context(devid);
 

+ 14 - 10
src/sched_policies/eager_central_policy.c

@@ -42,6 +42,11 @@ static void initialize_eager_center_policy(unsigned sched_ctx_id)
 	/* there is only a single queue in that trivial design */
 	data->fifo =  _starpu_create_fifo();
 
+	 /* Tell helgrind that it's fine to check for empty fifo in
+	  * pop_task_eager_policy without actual mutex (it's just an integer)
+	  */
+	STARPU_HG_DISABLE_CHECKING(data->fifo->ntasks);
+
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)data);
 	STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL);
 }
@@ -117,19 +122,12 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
 	struct starpu_task *task = NULL;
 
-	/* Tell helgrind that it's fine to check for empty fifo without actual
-	 * mutex (it's just a pointer) */
-	VALGRIND_HG_MUTEX_LOCK_PRE(&data->policy_mutex, 0);
-	VALGRIND_HG_MUTEX_LOCK_POST(&data->policy_mutex);
 	/* block until some event happens */
+	/* Here helgrind would shout that this is unprotected, this is just an
+	 * integer access, and we hold the sched mutex, so we can not miss any
+	 * wake up. */
 	if (_starpu_fifo_empty(data->fifo))
-	{
-		VALGRIND_HG_MUTEX_UNLOCK_PRE(&data->policy_mutex);
-		VALGRIND_HG_MUTEX_UNLOCK_POST(&data->policy_mutex);
 		return NULL;
-	}
-	VALGRIND_HG_MUTEX_UNLOCK_PRE(&data->policy_mutex);
-	VALGRIND_HG_MUTEX_UNLOCK_POST(&data->policy_mutex);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 	task = _starpu_fifo_pop_task(data->fifo, workerid);
@@ -147,6 +145,12 @@ static void eager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nw
 	{
 		workerid = workerids[i];
 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
+
+		starpu_pthread_mutex_t *sched_mutex;
+		starpu_pthread_cond_t *sched_cond;
+		starpu_worker_get_sched_condition(workerid, &sched_mutex, &sched_cond);
+
+		starpu_wakeup_worker(workerid, sched_cond, sched_mutex);
 	}
 }
 

+ 8 - 10
src/sched_policies/eager_central_priority_policy.c

@@ -84,6 +84,11 @@ static void initialize_eager_center_priority_policy(unsigned sched_ctx_id)
 
 	/* only a single queue (even though there are several internaly) */
 	data->taskq = _starpu_create_priority_taskq();
+
+	/* Tell helgrind that it's fine to check for empty fifo in
+	 * _starpu_priority_pop_task without actual mutex (it's just an
+	 * integer) */
+	STARPU_HG_DISABLE_CHECKING(data->taskq->total_ntasks);
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)data);
 	STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL);
 
@@ -154,19 +159,12 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 
 	struct _starpu_priority_taskq *taskq = data->taskq;
 
-	/* Tell helgrind that it's fine to check for empty fifo without actual
-	 * mutex (it's just a pointer) */
-	VALGRIND_HG_MUTEX_LOCK_PRE(&data->policy_mutex, 0);
-	VALGRIND_HG_MUTEX_LOCK_POST(&data->policy_mutex);
 	/* block until some event happens */
+	/* Here helgrind would shout that this is unprotected, this is just an
+	 * integer access, and we hold the sched mutex, so we can not miss any
+	 * wake up. */
 	if (taskq->total_ntasks == 0)
-	{
-		VALGRIND_HG_MUTEX_UNLOCK_PRE(&data->policy_mutex);
-		VALGRIND_HG_MUTEX_UNLOCK_POST(&data->policy_mutex);
 		return NULL;
-	}
-	VALGRIND_HG_MUTEX_UNLOCK_PRE(&data->policy_mutex);
-	VALGRIND_HG_MUTEX_UNLOCK_POST(&data->policy_mutex);
 
 	/* release this mutex before trying to wake up other workers */
 	starpu_pthread_mutex_t *curr_sched_mutex;

+ 20 - 23
src/sched_policies/parallel_heft.c

@@ -112,14 +112,14 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 	if (!starpu_worker_is_combined_worker(best_workerid))
 	{
-		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
-		/* TODO */
-		task->predicted_transfer = 0;
 		starpu_pthread_mutex_t *sched_mutex;
 		starpu_pthread_cond_t *sched_cond;
 		starpu_worker_get_sched_condition(best_workerid, &sched_mutex, &sched_cond);
 
 		STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
+		/* TODO */
+		task->predicted_transfer = 0;
 		worker_exp_len[best_workerid] += task->predicted;
 		worker_exp_end[best_workerid] = exp_end_predicted;
 		worker_exp_start[best_workerid] = exp_end_predicted - worker_exp_len[best_workerid];
@@ -196,11 +196,10 @@ static double compute_expected_end(int workerid, double length)
 		double res;
 		/* This is a basic worker */
 
-		VALGRIND_HG_MUTEX_LOCK_PRE(sched_mutex, 0);
-		VALGRIND_HG_MUTEX_LOCK_POST(sched_mutex);
+		/* Here helgrind would shout that this is unprotected, but we
+		 * are fine with getting outdated values, this is just an
+		 * estimation */
 		res = worker_exp_start[workerid] + worker_exp_len[workerid] + length;
-		VALGRIND_HG_MUTEX_UNLOCK_PRE(sched_mutex);
-		VALGRIND_HG_MUTEX_UNLOCK_POST(sched_mutex);
 
 		return res;
 	}
@@ -213,9 +212,9 @@ static double compute_expected_end(int workerid, double length)
 
 		double exp_end = DBL_MIN;
 
-		VALGRIND_HG_MUTEX_LOCK_PRE(sched_mutex, 0);
-		VALGRIND_HG_MUTEX_LOCK_POST(sched_mutex);
-
+		/* Here helgrind would shout that this is unprotected, but we
+		 * are fine with getting outdated values, this is just an
+		 * estimation */
 		int i;
 		for (i = 0; i < worker_size; i++)
 		{
@@ -225,9 +224,6 @@ static double compute_expected_end(int workerid, double length)
 			exp_end = STARPU_MAX(exp_end, local_exp_end);
 		}
 
-		VALGRIND_HG_MUTEX_UNLOCK_PRE(sched_mutex);
-		VALGRIND_HG_MUTEX_UNLOCK_POST(sched_mutex);
-
 		return exp_end;
 	}
 }
@@ -245,11 +241,10 @@ static double compute_ntasks_end(int workerid)
 		double res;
 		/* This is a basic worker */
 
-		VALGRIND_HG_MUTEX_LOCK_PRE(sched_mutex, 0);
-		VALGRIND_HG_MUTEX_LOCK_POST(sched_mutex);
+		/* Here helgrind would shout that this is unprotected, but we
+		 * are fine with getting outdated values, this is just an
+		 * estimation */
 		res = ntasks[workerid] / starpu_worker_get_relative_speedup(perf_arch);
-		VALGRIND_HG_MUTEX_UNLOCK_PRE(sched_mutex);
-		VALGRIND_HG_MUTEX_UNLOCK_POST(sched_mutex);
 
 		return res;
 	}
@@ -262,9 +257,9 @@ static double compute_ntasks_end(int workerid)
 
 		int ntasks_end=0;
 
-		VALGRIND_HG_MUTEX_LOCK_PRE(sched_mutex, 0);
-		VALGRIND_HG_MUTEX_LOCK_POST(sched_mutex);
-
+		/* Here helgrind would shout that this is unprotected, but we
+		 * are fine with getting outdated values, this is just an
+		 * estimation */
 		int i;
 		for (i = 0; i < worker_size; i++)
 		{
@@ -272,9 +267,6 @@ static double compute_ntasks_end(int workerid)
 			ntasks_end = STARPU_MAX(ntasks_end, (int) ((double) ntasks[combined_workerid[i]] / starpu_worker_get_relative_speedup(perf_arch)));
 		}
 
-		VALGRIND_HG_MUTEX_UNLOCK_PRE(sched_mutex);
-		VALGRIND_HG_MUTEX_UNLOCK_POST(sched_mutex);
-
 		return ntasks_end;
 	}
 }
@@ -580,6 +572,11 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 
 	STARPU_PTHREAD_MUTEX_INIT(&hd->global_push_mutex, NULL);
 
+	/* Tell helgrind that we are fine with getting outdated values when
+	 * estimating schedules */
+	STARPU_HG_DISABLE_CHECKING(worker_exp_start);
+	STARPU_HG_DISABLE_CHECKING(worker_exp_len);
+	STARPU_HG_DISABLE_CHECKING(ntasks);
 }
 
 static void parallel_heft_deinit(unsigned sched_ctx_id)

+ 8 - 4
src/sched_policies/work_stealing_policy.c

@@ -73,11 +73,10 @@ static unsigned select_victim_round_robin(unsigned sched_ctx_id)
 		unsigned njobs;
 
 		starpu_worker_get_sched_condition(worker, &victim_sched_mutex, &victim_sched_cond);
-		VALGRIND_HG_MUTEX_LOCK_PRE(victim_sched_mutex, 0);
-		VALGRIND_HG_MUTEX_LOCK_POST(victim_sched_mutex);
+		/* Here helgrind would shout that this is unprotected, but we
+		 * are fine with getting outdated values, this is just an
+		 * estimation */
 		njobs = ws->queue_array[worker]->njobs;
-		VALGRIND_HG_MUTEX_UNLOCK_PRE(victim_sched_mutex);
-		VALGRIND_HG_MUTEX_UNLOCK_POST(victim_sched_mutex);
 
 		if (njobs)
 			break;
@@ -402,6 +401,11 @@ static void ws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworke
 		workerid = workerids[i];
 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
 		ws->queue_array[workerid] = _starpu_create_deque();
+
+		/* Tell helgrid that we are fine with getting outdated values,
+		 * this is just an estimation */
+		STARPU_HG_DISABLE_CHECKING(ws->queue_array[workerid]->njobs);
+
 		/**
 		 * The first WS_POP_TASK will increase NPROCESSED though no task was actually performed yet,
 		 * we need to initialize it at -1.

+ 33 - 5
src/util/starpu_insert_task.c

@@ -21,7 +21,7 @@
 #include <starpu.h>
 #include <common/config.h>
 #include <stdarg.h>
-#include <util/starpu_insert_task_utils.h>
+#include <util/starpu_task_insert_utils.h>
 
 void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...)
 {
@@ -29,7 +29,7 @@ void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...)
 
 	/* Compute the size */
 	va_start(varg_list, arg_buffer_size);
-	*arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
+	*arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list);
 
 	va_start(varg_list, arg_buffer_size);
 	_starpu_codelet_pack_args(arg_buffer, *arg_buffer_size, varg_list);
@@ -62,7 +62,7 @@ void starpu_codelet_unpack_args(void *_cl_arg, ...)
 	va_end(varg_list);
 }
 
-int starpu_insert_task(struct starpu_codelet *cl, ...)
+int starpu_task_insert(struct starpu_codelet *cl, ...)
 {
 	va_list varg_list;
 	void *arg_buffer = NULL;
@@ -70,7 +70,7 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 	/* Compute the size */
 	size_t arg_buffer_size = 0;
 	va_start(varg_list, cl);
-	arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
+	arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list);
 
 	if (arg_buffer_size)
 	{
@@ -87,7 +87,7 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 	}
 
 	va_start(varg_list, cl);
-	int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, cl, &task, varg_list);
+	int ret = _starpu_task_insert_create_and_submit(arg_buffer, arg_buffer_size, cl, &task, varg_list);
 
 	if (ret == -ENODEV)
 	{
@@ -96,3 +96,31 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 	}
 	return ret;
 }
+
+struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...)
+{
+	va_list varg_list;
+	void *arg_buffer = NULL;
+
+	/* Compute the size */
+	size_t arg_buffer_size = 0;
+	va_start(varg_list, cl);
+	arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list);
+
+	if (arg_buffer_size)
+	{
+		va_start(varg_list, cl);
+		_starpu_codelet_pack_args(&arg_buffer, arg_buffer_size, varg_list);
+	}
+
+	struct starpu_task *task = starpu_task_create();
+
+	if (cl && cl->nbuffers > STARPU_NMAXBUFS)
+	{
+		task->dyn_handles = malloc(cl->nbuffers * sizeof(starpu_data_handle_t));
+	}
+
+	va_start(varg_list, cl);
+	_starpu_task_insert_create(arg_buffer, arg_buffer_size, cl, &task, varg_list);
+	return task;
+}

+ 0 - 0
src/util/starpu_insert_task_utils.c


Some files were not shown because too many files changed in this diff