12 years ago · 483f8e2979
--- a/ChangeLog
+++ b/ChangeLog
@@ -52,8 +52,6 @@ New features:
 
				     scheduled.
			
 
				 
			
 
				 Small features:
			
 
				-  * Add cl_arg_free field to enable automatic free(cl_arg) on task
			
 
				-    destroy.
			
 
				   * New functions starpu_data_acquire_cb_sequential_consistency() and
			
 
				     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
			
 
				     to enable or disable sequential consistency
			
@@ -64,6 +62,13 @@ Small features:
 
				     the tool starpu_perfmodel_display
			
 
				   * New batch files to execute StarPU applications under Microsoft
			
 
				     Visual Studio (They are installed in path_to_starpu/bin/mvsc)/
			
 
				+  * Add cl_arg_free, callback_arg_free, prologue_callback_arg_free fields to
			
 
				+    enable automatic free(cl_arg); free(callback_arg);
			
 
				+    free(prologue_callback_arg) on task destroy.
			
 
				+  * New function starpu_task_build
			
 
				+  * Functions starpu_insert_task and starpu_mpi_insert_task are
			
 
				+    renamed in starpu_task_insert and starpu_mpi_task_insert. Old
			
 
				+    names are kept to avoid breaking old codes.
			
 
				 
			
 
				 Changes:
			
 
				   * Fix of the livelock issue discovered while executing applications
			
@@ -72,6 +77,11 @@ Changes:
 
				   * Data interfaces (variable, vector, matrix and block) now define
			
 
				     pack und unpack functions
			
 
				   * Fix for properly dealing with NAN on windows systems
			
 
				+  * StarPU-MPI: Fix for being able to receive data which have not yet
			
 
				+    been registered by the application (i.e it did not call
			
 
				+    starpu_data_set_tag(), data are received as a raw memory)
			
 
				+  * StarPU-MPI: Fix for being able to receive data with the same tag
			
 
				+    from several nodes (see mpi/tests/gather.c)
			
 
				 
			
 
				 StarPU 1.1.0 (svn revision xxxx)
			
 
				 ==============================================
			
--- a/configure.ac
+++ b/configure.ac
@@ -54,8 +54,8 @@ AC_CANONICAL_SYSTEM
 
				 dnl Automake 1.11 introduced `silent-rules' and `color-tests'.  Use them
			
 
				 dnl when they're available.
			
 
				 m4_ifdef([AM_SILENT_RULES],
			
 
				-  [AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign silent-rules color-tests parallel-tests])],
			
 
				-  [AM_INIT_AUTOMAKE([1.10 -Wall -Werror foreign])])
			
 
				+  [AM_INIT_AUTOMAKE([1.11 -Wall foreign silent-rules color-tests parallel-tests])],
			
 
				+  [AM_INIT_AUTOMAKE([1.10 -Wall foreign])])
			
 
				 
			
 
				 m4_ifdef([AM_SILENT_RULES],
			
 
				   [AM_SILENT_RULES(yes)])
			
@@ -258,6 +258,8 @@ AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to
 
				 AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
			
 
				 AC_CHECK_HEADERS([valgrind/helgrind.h], [AC_DEFINE([STARPU_HAVE_HELGRIND_H], [1], [Define to 1 if you have the <valgrind/helgrind.h> header file.])])
			
 
				 
			
 
				+AC_CHECK_FUNC([sched_yield], [AC_DEFINE([STARPU_HAVE_SCHED_YIELD], [1], [Define to 1 if the function sched_yield is available.])])
			
 
				+
			
 
				 AC_CHECK_HEADERS([aio.h])
			
 
				 
			
 
				 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
			
@@ -1291,14 +1293,20 @@ AC_ARG_ENABLE(debug, [AS_HELP_STRING([--enable-debug], [enable debug mode])],
 
				 			enable_debug=$enableval, enable_debug=no)
			
 
				 AC_MSG_RESULT($enable_debug)
			
 
				 
			
 
				+AC_ARG_ENABLE(spinlock_check, [AS_HELP_STRING([--enable-spinlock-check], [enable spinlock check])], enable_spinlock_check=$enableval, enable_spinlock_check=no)
			
 
				+
			
 
				 if test x$enable_debug = xyes; then
			
 
				 	CFLAGS="$CFLAGS -O0"
			
 
				-	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
			
 
				+	enable_spinlock_check=yes
			
 
				 else
			
 
				 	CFLAGS="-O3 $CFLAGS"
			
 
				 fi
			
 
				 CFLAGS+=" -gdwarf-2 -g3 "
			
 
				 
			
 
				+if test x$enable_spinlock_check = xyes; then
			
 
				+	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
			
 
				+fi
			
 
				+
			
 
				 AC_MSG_CHECKING(whether extra checks should be performed)
			
 
				 AC_ARG_ENABLE(fast, [AS_HELP_STRING([--enable-fast],
			
 
				 			[do not enforce assertions])],
			
--- a/doc/doxygen/chapters/advanced_examples.doxy
+++ b/doc/doxygen/chapters/advanced_examples.doxy
@@ -481,7 +481,7 @@ to a less optimal solution. This increases even more computation time.
 
				 
			
 
				 \section InsertTaskUtility Insert Task Utility
			
 
				 
			
 
				-StarPU provides the wrapper function starpu_insert_task() to ease
			
 
				+StarPU provides the wrapper function starpu_task_insert() to ease
			
 
				 the creation and submission of tasks.
			
 
				 
			
 
				 Here the implementation of the codelet:
			
@@ -508,17 +508,17 @@ struct starpu_codelet mycodelet = {
 
				 };
			
 
				 \endcode
			
 
				 
			
 
				-And the call to the function starpu_insert_task():
			
 
				+And the call to the function starpu_task_insert():
			
 
				 
			
 
				 \code{.c}
			
 
				-starpu_insert_task(&mycodelet,
			
 
				+starpu_task_insert(&mycodelet,
			
 
				                    STARPU_VALUE, &ifactor, sizeof(ifactor),
			
 
				                    STARPU_VALUE, &ffactor, sizeof(ffactor),
			
 
				                    STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
			
 
				                    0);
			
 
				 \endcode
			
 
				 
			
 
				-The call to starpu_insert_task() is equivalent to the following
			
 
				+The call to starpu_task_insert() is equivalent to the following
			
 
				 code:
			
 
				 
			
 
				 \code{.c}
			
@@ -540,7 +540,7 @@ int ret = starpu_task_submit(task);
 
				 Here a similar call using ::STARPU_DATA_ARRAY.
			
 
				 
			
 
				 \code{.c}
			
 
				-starpu_insert_task(&mycodelet,
			
 
				+starpu_task_insert(&mycodelet,
			
 
				                    STARPU_DATA_ARRAY, data_handles, 2,
			
 
				                    STARPU_VALUE, &ifactor, sizeof(ifactor),
			
 
				                    STARPU_VALUE, &ffactor, sizeof(ffactor),
			
@@ -554,11 +554,11 @@ instance, assuming that the index variable <c>i</c> was registered as handle
 
				 
			
 
				 \code{.c}
			
 
				 /* Compute which portion we will work on, e.g. pivot */
			
 
				-starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
			
 
				+starpu_task_insert(&which_index, STARPU_W, i_handle, 0);
			
 
				 
			
 
				 /* And submit the corresponding task */
			
 
				 STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
			
 
				-                       starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
			
 
				+                       starpu_task_insert(&work, STARPU_RW, A_handle[i], 0));
			
 
				 \endcode
			
 
				 
			
 
				 The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
			
@@ -637,7 +637,7 @@ dot products with partitioned vectors:
 
				 
			
 
				 \code{.c}
			
 
				 for (b = 0; b < nblocks; b++)
			
 
				-    starpu_insert_task(&dot_kernel_cl,
			
 
				+    starpu_task_insert(&dot_kernel_cl,
			
 
				         STARPU_REDUX, dtq_handle,
			
 
				         STARPU_R, starpu_data_get_sub_data(v1, 1, b),
			
 
				         STARPU_R, starpu_data_get_sub_data(v2, 1, b),
			
@@ -659,9 +659,9 @@ the initial status <c>register(NULL)</c>.
 
				 The example <c>cg</c> also uses reduction for the blocked gemv kernel,
			
 
				 leading to yet more relaxed dependencies and more parallelism.
			
 
				 
			
 
				-::STARPU_REDUX can also be passed to starpu_mpi_insert_task() in the MPI
			
 
				+::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
			
 
				 case. That will however not produce any MPI communication, but just pass
			
 
				-::STARPU_REDUX to the underlying starpu_insert_task(). It is up to the
			
 
				+::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
			
 
				 application to call starpu_mpi_redux_data(), which posts tasks that will
			
 
				 reduce the partial results among MPI nodes into the MPI node which owns the
			
 
				 data. For instance, some hypothetical application which collects partial results
			
@@ -670,11 +670,11 @@ with a new reduction:
 
				 
			
 
				 \code{.c}
			
 
				 for (i = 0; i < 100; i++) {
			
 
				-    starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
			
 
				-    starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A,
			
 
				+    starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
			
 
				+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
			
 
				                STARPU_R, B, STARPU_REDUX, res, 0);
			
 
				     starpu_mpi_redux_data(MPI_COMM_WORLD, res);
			
 
				-    starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
			
 
				+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
			
 
				 }
			
 
				 \endcode
			
 
				 
			
@@ -705,9 +705,9 @@ unregistration.
 
				 
			
 
				 \code{.c}
			
 
				 starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
			
 
				-starpu_insert_task(&produce_data, STARPU_W, handle, 0);
			
 
				-starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
			
 
				-starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
			
 
				+starpu_task_insert(&produce_data, STARPU_W, handle, 0);
			
 
				+starpu_task_insert(&compute_data, STARPU_RW, handle, 0);
			
 
				+starpu_task_insert(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
			
 
				 starpu_data_unregister_submit(handle);
			
 
				 \endcode
			
 
				 
			
@@ -725,7 +725,7 @@ provides per-worker buffers without content consistency.
 
				 \code{.c}
			
 
				 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
			
 
				 for (i = 0; i < N; i++)
			
 
				-    starpu_insert_task(&compute, STARPU_R, input[i],
			
 
				+    starpu_task_insert(&compute, STARPU_R, input[i],
			
 
				                        STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
			
 
				 \endcode
			
 
				 
			
@@ -1028,7 +1028,7 @@ starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
 
				                             output, num_bytes / sizeof(float4), sizeof(float4));
			
 
				 
			
 
				 /* The handle can now be used as usual */
			
 
				-starpu_insert_task(&cl, STARPU_RW, handle, 0);
			
 
				+starpu_task_insert(&cl, STARPU_RW, handle, 0);
			
 
				 
			
 
				 /* ... */
			
 
				 
			
@@ -1122,7 +1122,7 @@ Complex data interfaces can then be registered to StarPU.
 
				 \code{.c}
			
 
				 double real = 45.0;
			
 
				 double imaginary = 12.0;starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
			
 
				-starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
			
 
				+starpu_task_insert(&cl_display, STARPU_R, handle1, 0);
			
 
				 \endcode
			
 
				 
			
 
				 and used by codelets.
			
@@ -1186,7 +1186,7 @@ for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
 
				 {
			
 
				 	handles[i] = handle;
			
 
				 }
			
 
				-starpu_insert_task(&dummy_big_cl,
			
 
				+starpu_task_insert(&dummy_big_cl,
			
 
				         	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
			
 
				 		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
			
 
				 		 0);
			
--- a/doc/doxygen/chapters/api/codelet_and_tasks.doxy
+++ b/doc/doxygen/chapters/api/codelet_and_tasks.doxy
@@ -406,20 +406,32 @@ Optional field, the default value is <c>NULL</c>. This is the pointer
 
				 passed to the callback function. This field is ignored if the field
			
 
				 starpu_task::callback_func is set to <c>NULL</c>.
			
 
				 
			
 
				-\var starpu_task::prologue_func
			
 
				+\var starpu_task::callback_arg_free
			
 
				+Optional field. In case starpu_task::callback_arg was allocated by the
			
 
				+application through <c>malloc()</c>, setting starpu_task::callback_arg_free
			
 
				+to 1 makes StarPU automatically call <c>free(callback_arg)</c> when
			
 
				+destroying the task.
			
 
				+
			
 
				+\var starpu_task::prologue_callback_func
			
 
				 Optional field, the default value is <c>NULL</c>. This is a function
			
 
				 pointer of prototype <c>void (*f)(void *)</c> which specifies a
			
 
				 possible callback. 
			
 
				 If this pointer is non-null, the callback function
			
 
				 is executed on the host when the task becomes ready for execution,
			
 
				 before getting scheduled. The callback is passed the
			
 
				-value contained in the starpu_task::prologue_arg field. No callback is
			
 
				+value contained in the starpu_task::prologue_callback_arg field. No callback is
			
 
				 executed if the field is set to NULL.
			
 
				 
			
 
				-\var starpu_task::prologue_arg (optional) (default: NULL)
			
 
				+\var starpu_task::prologue_callback_arg (optional) (default: NULL)
			
 
				 Optional field, the default value is <c>NULL</c>. This is the pointer
			
 
				-passed to the prologue function. This field is ignored if the field
			
 
				-starpu_task::prologue_func is set to <c>NULL</c>.
			
 
				+passed to the prologue callback function. This field is ignored if the field
			
 
				+starpu_task::prologue_callback_func is set to <c>NULL</c>.
			
 
				+
			
 
				+\var starpu_task::prologue_callback_arg_free
			
 
				+Optional field. In case starpu_task::prologue_callback_arg was allocated by the
			
 
				+application through <c>malloc()</c>, setting starpu_task::prologue_callback_arg_free
			
 
				+to 1 makes StarPU automatically call <c>free(prologue_callback_arg)</c> when
			
 
				+destroying the task.
			
 
				 
			
 
				 \var starpu_task::use_tag
			
 
				 Optional field, the default value is 0. If set, this flag indicates
			
--- a/doc/doxygen/chapters/api/insert_task.doxy
+++ b/doc/doxygen/chapters/api/insert_task.doxy
@@ -8,7 +8,11 @@
 
				 
			
 
				 /*! \defgroup API_Insert_Task Insert_Task
			
 
				 
			
 
				-\fn int starpu_insert_task(struct starpu_codelet *cl, ...)
			
 
				+\def starpu_insert_task
			
 
				+\ingroup API_Insert_Task
			
 
				+Convenience macro for the function starpu_task_insert() which used to be called starpu_insert_task.
			
 
				+
			
 
				+\fn int starpu_task_insert(struct starpu_codelet *cl, ...)
			
 
				 \ingroup API_Insert_Task
			
 
				 Create and submit a task corresponding to \p cl with the
			
 
				 following arguments. The argument list must be zero-terminated.
			
@@ -35,18 +39,18 @@ implementation to retrieve them.
 
				 
			
 
				 \def STARPU_VALUE
			
 
				 \ingroup API_Insert_Task
			
 
				-this macro is used when calling starpu_insert_task(), and must
			
 
				+this macro is used when calling starpu_task_insert(), and must
			
 
				 be followed by a pointer to a constant value and the size of the
			
 
				 constant
			
 
				 
			
 
				 \def STARPU_CALLBACK
			
 
				 \ingroup API_Insert_Task
			
 
				-this macro is used when calling starpu_insert_task(), and must
			
 
				+this macro is used when calling starpu_task_insert(), and must
			
 
				 be followed by a pointer to a callback function
			
 
				 
			
 
				 \def STARPU_CALLBACK_WITH_ARG
			
 
				 \ingroup API_Insert_Task
			
 
				-this macro is used when calling starpu_insert_task(), and must
			
 
				+this macro is used when calling starpu_task_insert(), and must
			
 
				 be followed by two pointers: one to a callback function, and the other
			
 
				 to be given as an argument to the callback function; this is
			
 
				 equivalent to using both ::STARPU_CALLBACK and
			
@@ -54,13 +58,13 @@ equivalent to using both ::STARPU_CALLBACK and
 
				 
			
 
				 \def STARPU_CALLBACK_ARG
			
 
				 \ingroup API_Insert_Task
			
 
				-this macro is used when calling starpu_insert_task(), and must
			
 
				+this macro is used when calling starpu_task_insert(), and must
			
 
				 be followed by a pointer to be given as an argument to the callback
			
 
				 function
			
 
				 
			
 
				 \def STARPU_PRIORITY
			
 
				 \ingroup API_Insert_Task
			
 
				-this macro is used when calling starpu_insert_task(), and must
			
 
				+this macro is used when calling starpu_task_insert(), and must
			
 
				 be followed by a integer defining a priority level
			
 
				 
			
 
				 \def STARPU_DATA_ARRAY
			
@@ -69,18 +73,18 @@ TODO
 
				 
			
 
				 \def STARPU_TAG
			
 
				 \ingroup API_Insert_Task
			
 
				-this macro is used when calling starpu_insert_task(), and must be followed by a tag.
			
 
				+this macro is used when calling starpu_task_insert(), and must be followed by a tag.
			
 
				 
			
 
				 \def STARPU_FLOPS
			
 
				 \ingroup API_Insert_Task
			
 
				-this macro is used when calling starpu_insert_task(), and must
			
 
				+this macro is used when calling starpu_task_insert(), and must
			
 
				 be followed by an amount of floating point operations, as a double.
			
 
				 Users <b>MUST</b> explicitly cast into double, otherwise parameter
			
 
				 passing will not work.
			
 
				 
			
 
				 \def STARPU_SCHED_CTX
			
 
				 \ingroup API_Insert_Task
			
 
				-this macro is used when calling starpu_insert_task(), and must
			
 
				+this macro is used when calling starpu_task_insert(), and must
			
 
				 be followed by the id of the scheduling context to which we want to
			
 
				 submit the task.
			
 
				 
			
@@ -93,6 +97,13 @@ starpu_codelet_unpack_args().
 
				 \fn void starpu_codelet_unpack_args(void *cl_arg, ...)
			
 
				 \ingroup API_Insert_Task
			
 
				 Retrieve the arguments of type ::STARPU_VALUE associated to a
			
 
				-task automatically created using the function starpu_insert_task().
			
 
				+task automatically created using the function starpu_task_insert().
			
 
				+
			
 
				+\fn struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...)
			
 
				+\ingroup API_Insert_Task
			
 
				+Create a task corresponding to \p cl with the following arguments.
			
 
				+The argument list must be zero-terminated. The arguments
			
 
				+following the codelet are the same as the ones for the function
			
 
				+starpu_task_insert().
			
 
				 
			
 
				 */
			
--- a/doc/doxygen/chapters/api/mpi.doxy
+++ b/doc/doxygen/chapters/api/mpi.doxy
@@ -204,23 +204,27 @@ Returns the last value set by starpu_data_set_rank().
 
				 
			
 
				 \def STARPU_EXECUTE_ON_NODE
			
 
				 \ingroup API_MPI_Support
			
 
				-this macro is used when calling starpu_mpi_insert_task(), and must be
			
 
				+this macro is used when calling starpu_mpi_task_insert(), and must be
			
 
				 followed by a integer value which specified the node on which to
			
 
				 execute the codelet.
			
 
				 
			
 
				 \def STARPU_EXECUTE_ON_DATA
			
 
				 \ingroup API_MPI_Support
			
 
				-this macro is used when calling starpu_mpi_insert_task(), and must be
			
 
				+this macro is used when calling starpu_mpi_task_insert(), and must be
			
 
				 followed by a data handle to specify that the node owning the given
			
 
				 data will execute the codelet.
			
 
				 
			
 
				-\fn int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
			
 
				+\def starpu_mpi_insert_task
			
 
				+\ingroup API_MPI_Support
			
 
				+Convenience macro for the function starpu_mpi_task_insert() which used to be called starpu_mpi_insert_task.
			
 
				+
			
 
				+\fn int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
			
 
				 \ingroup API_MPI_Support
			
 
				 Create and submit a task corresponding to codelet with the following
			
 
				 arguments. The argument list must be zero-terminated.
			
 
				 
			
 
				 The arguments following the codelets are the same types as for the
			
 
				-function starpu_insert_task(). The extra argument
			
 
				+function starpu_task_insert(). The extra argument
			
 
				 ::STARPU_EXECUTE_ON_NODE followed by an integer allows to specify the
			
 
				 MPI node to execute the codelet. It is also possible to specify that
			
 
				 the node owning a specific data will execute the codelet, by using
			
--- a/doc/doxygen/chapters/configure_options.doxy
+++ b/doc/doxygen/chapters/configure_options.doxy
@@ -22,6 +22,13 @@ the following configure options.
 
				 Enable debugging messages.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>--enable-spinlock-check</dt>
			
 
				+<dd>
			
 
				+\anchor enable-spinlock-check
			
 
				+\addindex __configure__--enable-spinlock-check
			
 
				+Enable checking that spinlocks are taken and released properly.
			
 
				+</dd>
			
 
				+
			
 
				 <dt>--enable-fast</dt>
			
 
				 <dd>
			
 
				 \anchor enable-fast
			
--- a/doc/doxygen/chapters/mpi_support.doxy
+++ b/doc/doxygen/chapters/mpi_support.doxy
@@ -232,7 +232,7 @@ task, and trigger the required MPI transfers.
 
				 
			
 
				 The list of functions is described in \ref MPIInsertTask "MPI Insert Task".
			
 
				 
			
 
				-Here an stencil example showing how to use starpu_mpi_insert_task(). One
			
 
				+Here an stencil example showing how to use starpu_mpi_task_insert(). One
			
 
				 first needs to define a distribution function which specifies the
			
 
				 locality of the data. Note that that distribution information needs to
			
 
				 be given to StarPU by calling starpu_data_set_rank(). A MPI tag
			
@@ -291,14 +291,14 @@ data which will be needed by the tasks that we will execute.
 
				     }
			
 
				 \endcode
			
 
				 
			
 
				-Now starpu_mpi_insert_task() can be called for the different
			
 
				+Now starpu_mpi_task_insert() can be called for the different
			
 
				 steps of the application.
			
 
				 
			
 
				 \code{.c}
			
 
				     for(loop=0 ; loop<niter; loop++)
			
 
				         for (x = 1; x < X-1; x++)
			
 
				             for (y = 1; y < Y-1; y++)
			
 
				-                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl,
			
 
				+                starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl,
			
 
				                                        STARPU_RW, data_handles[x][y],
			
 
				                                        STARPU_R, data_handles[x-1][y],
			
 
				                                        STARPU_R, data_handles[x+1][y],
			
@@ -365,7 +365,7 @@ for(x = 0; x < nblocks ;  x++) {
 
				     if (data_handles[x]) {
			
 
				         int owner = starpu_data_get_rank(data_handles[x]);
			
 
				         if (owner == rank) {
			
 
				-            starpu_insert_task(&cl, STARPU_RW, data_handles[x], 0);
			
 
				+            starpu_task_insert(&cl, STARPU_RW, data_handles[x], 0);
			
 
				         }
			
 
				     }
			
 
				 }
			
@@ -383,9 +383,9 @@ MPI examples are available in the StarPU source code in mpi/examples:
 
				 <ul>
			
 
				 <li><c>complex</c> is a simple example using a user-define data interface over
			
 
				 MPI (complex numbers),
			
 
				-<li><c>stencil5</c> is a simple stencil example using <c>starpu_mpi_insert_task</c>,
			
 
				+<li><c>stencil5</c> is a simple stencil example using starpu_mpi_task_insert(),
			
 
				 <li><c>matrix_decomposition</c> is a cholesky decomposition example using
			
 
				-<c>starpu_mpi_insert_task</c>. The non-distributed version can check for
			
 
				+starpu_mpi_task_insert(). The non-distributed version can check for
			
 
				 <algorithm correctness in 1-node configuration, the distributed version uses
			
 
				 exactly the same source code, to be used over MPI,
			
 
				 <li><c>mpi_lu</c> is an LU decomposition example, provided in three versions:
			
--- a/doc/doxygen/chapters/scheduling_context_hypervisor.doxy
+++ b/doc/doxygen/chapters/scheduling_context_hypervisor.doxy
@@ -76,7 +76,7 @@ The <b>Application driven</b> strategy uses the user's input concerning the mome
 
				 Thus, the users tags the task that should trigger the resizing
			
 
				 process. We can set directly the field starpu_task::hypervisor_tag or
			
 
				 use the macro ::STARPU_HYPERVISOR_TAG in the function
			
 
				-starpu_insert_task().
			
 
				+starpu_task_insert().
			
 
				 
			
 
				 \code{.c}
			
 
				 task.hypervisor_tag = 2;
			
@@ -85,7 +85,7 @@ task.hypervisor_tag = 2;
 
				 or
			
 
				 
			
 
				 \code{.c}
			
 
				-starpu_insert_task(&codelet,
			
 
				+starpu_task_insert(&codelet,
			
 
				 		    ...,
			
 
				 		    STARPU_HYPERVISOR_TAG, 2,
			
 
				                     0);
			
@@ -131,7 +131,7 @@ The number of flops to be executed by a context are passed as
 
				  (<c>sc_hypervisor_register_ctx(sched_ctx_id, flops)</c>) and the one
			
 
				  to be executed by each task are passed when the task is submitted.
			
 
				  The corresponding field is starpu_task::flops and the corresponding
			
 
				- macro in the function starpu_insert_task() is ::STARPU_FLOPS
			
 
				+ macro in the function starpu_task_insert() is ::STARPU_FLOPS
			
 
				  (<b>Caution</b>: but take care of passing a double, not an integer,
			
 
				  otherwise parameter passing will be bogus). When the task is executed
			
 
				  the resizing process is triggered.
			
@@ -143,7 +143,7 @@ task.flops = 100;
 
				 or
			
 
				 
			
 
				 \code{.c}
			
 
				-starpu_insert_task(&codelet,
			
 
				+starpu_task_insert(&codelet,
			
 
				                     ...,
			
 
				                     STARPU_FLOPS, (double) 100,
			
 
				                     0);
			
@@ -215,4 +215,4 @@ struct sc_hypervisor_policy dummy_policy =
 
				 \endcode
			
 
				 
			
 
				 
			
 
				-*/
			
 
				+*/
			
--- a/doc/doxygen/chapters/scheduling_contexts.doxy
+++ b/doc/doxygen/chapters/scheduling_contexts.doxy
@@ -97,7 +97,7 @@ the current thread will submit tasks to the coresponding context.
 
				 When the application may not assign a thread of submission to each
			
 
				 context, the id of the context must be indicated by using the
			
 
				 function <c>starpu_task_submit_to_ctx</c> or the field <c>STARPU_SCHED_CTX</c> 
			
 
				-for <c>starpu_insert_task</c>.
			
 
				+for starpu_task_insert().
			
 
				 
			
 
				 \section DeletingAContext Deleting A Context
			
 
				 
			
--- a/examples/basic_examples/dynamic_handles.c
+++ b/examples/basic_examples/dynamic_handles.c
@@ -112,12 +112,12 @@ int main(int argc, char **argv)
 
				 	if (ret == -ENODEV) goto enodev;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 
			
 
				-	ret = starpu_insert_task(&dummy_small_cl,
			
 
				+	ret = starpu_task_insert(&dummy_small_cl,
			
 
				 				 STARPU_VALUE, &dummy_small_cl.nbuffers, sizeof(dummy_small_cl.nbuffers),
			
 
				 				 STARPU_RW, handle,
			
 
				 				 0);
			
 
				 	if (ret == -ENODEV) goto enodev;
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				         ret = starpu_task_wait_for_all();
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				 
			
@@ -126,12 +126,12 @@ int main(int argc, char **argv)
 
				 	{
			
 
				 		handles[i] = handle;
			
 
				 	}
			
 
				-	ret = starpu_insert_task(&dummy_big_cl,
			
 
				+	ret = starpu_task_insert(&dummy_big_cl,
			
 
				 				 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
			
 
				 				 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
			
 
				 				 0);
			
 
				 	if (ret == -ENODEV) goto enodev;
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				         ret = starpu_task_wait_for_all();
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				 	free(handles);
			
--- a/examples/binary/binary.c
+++ b/examples/binary/binary.c
@@ -67,7 +67,7 @@ int compute(char *file_name, int load_as_file)
 
				 
			
 
				 	for (i = 0; i < niter; i++)
			
 
				 	{
			
 
				-		ret = starpu_insert_task(&cl, STARPU_RW, float_array_handle, 0);
			
 
				+		ret = starpu_task_insert(&cl, STARPU_RW, float_array_handle, 0);
			
 
				 		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 		{
			
 
				 			FPRINTF(stderr, "No worker may execute this task\n");
			
--- a/examples/callback/prologue.c
+++ b/examples/callback/prologue.c
@@ -80,7 +80,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	double *x = (double*)malloc(sizeof(double));
			
 
				 	*x = -999.0;
			
 
				-	int ret2 = starpu_insert_task(&cl,
			
 
				+	int ret2 = starpu_task_insert(&cl,
			
 
				 				      STARPU_RW, handle,
			
 
				 				      STARPU_PROLOGUE_CALLBACK, prologue_callback_func,
			
 
				 				      STARPU_PROLOGUE_CALLBACK_ARG, x,
			
@@ -92,6 +92,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	FPRINTF(stderr, "v -> %d\n", v);
			
 
				 
			
 
				+	free(x);
			
 
				+
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return 0;
			
--- a/examples/cg/cg_kernels.c
+++ b/examples/cg/cg_kernels.c
@@ -288,20 +288,20 @@ int dot_kernel(starpu_data_handle_t v1,
 
				 	if (use_reduction)
			
 
				 		starpu_data_invalidate_submit(s);
			
 
				 	else {
			
 
				-		ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
			
 
				+		ret = starpu_task_insert(&bzero_variable_cl, STARPU_W, s, 0);
			
 
				 		if (ret == -ENODEV) return ret;
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 	}
			
 
				 
			
 
				 	unsigned b;
			
 
				 	for (b = 0; b < nblocks; b++)
			
 
				 	{
			
 
				-		ret = starpu_insert_task(&dot_kernel_cl,
			
 
				+		ret = starpu_task_insert(&dot_kernel_cl,
			
 
				 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
			
 
				 					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
			
 
				 					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
			
 
				 					 0);
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 	}
			
 
				 	return 0;
			
 
				 }
			
@@ -442,12 +442,12 @@ int gemv_kernel(starpu_data_handle_t v1,
 
				 
			
 
				 	for (b2 = 0; b2 < nblocks; b2++)
			
 
				 	{
			
 
				-		ret = starpu_insert_task(&scal_kernel_cl,
			
 
				+		ret = starpu_task_insert(&scal_kernel_cl,
			
 
				 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
			
 
				 					 STARPU_VALUE, &p1, sizeof(p1),
			
 
				 					 0);
			
 
				 		if (ret == -ENODEV) return ret;
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 	}
			
 
				 
			
 
				 	for (b2 = 0; b2 < nblocks; b2++)
			
@@ -455,14 +455,14 @@ int gemv_kernel(starpu_data_handle_t v1,
 
				 		for (b1 = 0; b1 < nblocks; b1++)
			
 
				 		{
			
 
				 			TYPE one = 1.0;
			
 
				-			ret = starpu_insert_task(&gemv_kernel_cl,
			
 
				+			ret = starpu_task_insert(&gemv_kernel_cl,
			
 
				 						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
			
 
				 						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
			
 
				 						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
			
 
				 						 STARPU_VALUE,	&one,	sizeof(one),
			
 
				 						 STARPU_VALUE,	&p2,	sizeof(p2),
			
 
				 						 0);
			
 
				-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 		}
			
 
				 	}
			
 
				 	return 0;
			
@@ -535,14 +535,14 @@ int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
 
				 	unsigned b;
			
 
				 	for (b = 0; b < nblocks; b++)
			
 
				 	{
			
 
				-		ret = starpu_insert_task(&scal_axpy_kernel_cl,
			
 
				+		ret = starpu_task_insert(&scal_axpy_kernel_cl,
			
 
				 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
			
 
				 					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
			
 
				 					 STARPU_VALUE, &p1, sizeof(p1),
			
 
				 					 STARPU_VALUE, &p2, sizeof(p2),
			
 
				 					 0);
			
 
				 		if (ret == -ENODEV) return ret;
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 	}
			
 
				 	return 0;
			
 
				 }
			
@@ -609,13 +609,13 @@ int axpy_kernel(starpu_data_handle_t v1,
 
				 	unsigned b;
			
 
				 	for (b = 0; b < nblocks; b++)
			
 
				 	{
			
 
				-		ret = starpu_insert_task(&axpy_kernel_cl,
			
 
				+		ret = starpu_task_insert(&axpy_kernel_cl,
			
 
				 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
			
 
				 					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
			
 
				 					 STARPU_VALUE, &p1, sizeof(p1),
			
 
				 					 0);
			
 
				 		if (ret == -ENODEV) return ret;
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 	}
			
 
				 	return 0;
			
 
				 }
			
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -99,27 +99,27 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 	{
			
 
				                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				 
			
 
				-                ret = starpu_insert_task(&cl11,
			
 
				+                ret = starpu_task_insert(&cl11,
			
 
				 					 STARPU_PRIORITY, prio_level,
			
 
				 					 STARPU_RW, sdatakk,
			
 
				 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
			
 
				 					 STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
			
 
				 					 0);
			
 
				 		if (ret == -ENODEV) return 77;
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 
			
 
				 		for (j = k+1; j<nblocks; j++)
			
 
				 		{
			
 
				                         starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
			
 
				 
			
 
				-                        ret = starpu_insert_task(&cl21,
			
 
				+                        ret = starpu_task_insert(&cl21,
			
 
				 						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
			
 
				 						 STARPU_R, sdatakk,
			
 
				 						 STARPU_RW, sdatakj,
			
 
				 						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
			
 
				 						 0);
			
 
				 			if (ret == -ENODEV) return 77;
			
 
				-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 
			
 
				 			for (i = k+1; i<nblocks; i++)
			
 
				 			{
			
@@ -128,7 +128,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 					starpu_data_handle_t sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
			
 
				 					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
			
 
				 
			
 
				-					ret = starpu_insert_task(&cl22,
			
 
				+					ret = starpu_task_insert(&cl22,
			
 
				 								 STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
			
 
				 								 STARPU_R, sdataki,
			
 
				 								 STARPU_R, sdatakj,
			
@@ -136,7 +136,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
			
 
				 								 0);
			
 
				 					if (ret == -ENODEV) return 77;
			
 
				-					STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				                                 }
			
 
				 			}
			
 
				 		}
			
--- a/examples/cpp/incrementer_cpp.cpp
+++ b/examples/cpp/incrementer_cpp.cpp
@@ -71,7 +71,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	for (i = 0; i < niter; i++)
			
 
				 	{
			
 
				-		ret = starpu_insert_task(&cl,
			
 
				+		ret = starpu_task_insert(&cl,
			
 
				 					 STARPU_RW, float_array_handle,
			
 
				 					 0);
			
 
				                 if (STARPU_UNLIKELY(ret == -ENODEV))
			
--- a/examples/interface/complex.c
+++ b/examples/interface/complex.c
@@ -18,8 +18,6 @@
 
				 #include "complex_interface.h"
			
 
				 #include "complex_codelet.h"
			
 
				 
			
 
				-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				-
			
 
				 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
			
 
				 {
			
 
				        if (starpu_worker_get_type(workerid) == STARPU_OPENCL_WORKER)
			
@@ -95,21 +93,21 @@ int main(int argc, char **argv)
 
				 	starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
			
 
				 	starpu_complex_data_register(&handle2, STARPU_MAIN_RAM, &copy_real, &copy_imaginary, 1);
			
 
				 
			
 
				-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
			
 
				+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
			
 
				 	if (ret == -ENODEV) goto end;
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 
			
 
				-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
			
 
				+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
			
 
				 	if (ret == -ENODEV) goto end;
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 
			
 
				-	ret = starpu_insert_task(&cl_compare,
			
 
				+	ret = starpu_task_insert(&cl_compare,
			
 
				 				 STARPU_R, handle1,
			
 
				 				 STARPU_R, handle2,
			
 
				 				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
			
 
				 				 0);
			
 
				 	if (ret == -ENODEV) goto end;
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 	starpu_task_wait_for_all();
			
 
				 	if (compare != 0)
			
 
				 	{
			
@@ -117,28 +115,28 @@ int main(int argc, char **argv)
 
				 	     goto end;
			
 
				 	}
			
 
				 
			
 
				-	ret = starpu_insert_task(&cl_copy,
			
 
				+	ret = starpu_task_insert(&cl_copy,
			
 
				 				 STARPU_R, handle1,
			
 
				 				 STARPU_W, handle2,
			
 
				 				 0);
			
 
				 	if (ret == -ENODEV) goto end;
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 
			
 
				-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
			
 
				+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
			
 
				 	if (ret == -ENODEV) goto end;
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 
			
 
				-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
			
 
				+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
			
 
				 	if (ret == -ENODEV) goto end;
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 
			
 
				-	ret = starpu_insert_task(&cl_compare,
			
 
				+	ret = starpu_task_insert(&cl_compare,
			
 
				 				 STARPU_R, handle1,
			
 
				 				 STARPU_R, handle2,
			
 
				 				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
			
 
				 				 0);
			
 
				 	if (ret == -ENODEV) goto end;
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
--- a/examples/interface/complex_codelet.h
+++ b/examples/interface/complex_codelet.h
@@ -20,6 +20,8 @@
 
				 #ifndef __COMPLEX_CODELET_H
			
 
				 #define __COMPLEX_CODELET_H
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				+
			
 
				 void compare_complex_codelet(void *descr[], void *_args)
			
 
				 {
			
 
				 	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
			
@@ -70,7 +72,7 @@ void display_complex_codelet(void *descr[], void *_args)
 
				 
			
 
				 	for(i=0 ; i<nx ; i++)
			
 
				 	{
			
 
				-		fprintf(stderr, "[%s] Complex[%d] = %3.2f + %3.2f i\n", _args?msg:NULL, i, real[i], imaginary[i]);
			
 
				+		FPRINTF(stderr, "[%s] Complex[%d] = %3.2f + %3.2f i\n", _args?msg:NULL, i, real[i], imaginary[i]);
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/examples/mandelbrot/mandelbrot.c
+++ b/examples/mandelbrot/mandelbrot.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -529,7 +529,7 @@ int main(int argc, char **argv)
 
				 			per_block_cnt[iby] = 0;
			
 
				 			int *pcnt = &per_block_cnt[iby];
			
 
				 
			
 
				-			ret = starpu_insert_task(use_spmd?&spmd_mandelbrot_cl:&mandelbrot_cl,
			
 
				+			ret = starpu_task_insert(use_spmd?&spmd_mandelbrot_cl:&mandelbrot_cl,
			
 
				 						 STARPU_VALUE, &iby, sizeof(iby),
			
 
				 						 STARPU_VALUE, &block_size, sizeof(block_size),
			
 
				 						 STARPU_VALUE, &stepX, sizeof(stepX),
			
@@ -537,7 +537,7 @@ int main(int argc, char **argv)
 
				 						 STARPU_W, block_handles[iby],
			
 
				 						 STARPU_VALUE, &pcnt, sizeof(int *),
			
 
				 						 0);
			
 
				-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 		}
			
 
				 
			
 
				 		for (iby = 0; iby < nblocks; iby++)
			
--- a/examples/pipeline/pipeline.c
+++ b/examples/pipeline/pipeline.c
@@ -200,33 +200,33 @@ int main(void)
 
				 			sem_wait(&sems[l%C]);
			
 
				 
			
 
				 		/* Now submit the next stage */
			
 
				-		ret = starpu_insert_task(&pipeline_codelet_x,
			
 
				+		ret = starpu_task_insert(&pipeline_codelet_x,
			
 
				 				STARPU_W, buffersX[l%K],
			
 
				 				STARPU_VALUE, &x, sizeof(x),
			
 
				 				0);
			
 
				 		if (ret == -ENODEV) goto enodev;
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task x");
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert x");
			
 
				 
			
 
				-		ret = starpu_insert_task(&pipeline_codelet_x,
			
 
				+		ret = starpu_task_insert(&pipeline_codelet_x,
			
 
				 				STARPU_W, buffersY[l%K],
			
 
				 				STARPU_VALUE, &y, sizeof(y),
			
 
				 				0);
			
 
				 		if (ret == -ENODEV) goto enodev;
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task y");
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert y");
			
 
				 
			
 
				-		ret = starpu_insert_task(&pipeline_codelet_axpy,
			
 
				+		ret = starpu_task_insert(&pipeline_codelet_axpy,
			
 
				 				STARPU_R, buffersX[l%K],
			
 
				 				STARPU_RW, buffersY[l%K],
			
 
				 				0);
			
 
				 		if (ret == -ENODEV) goto enodev;
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task axpy");
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert axpy");
			
 
				 
			
 
				-		ret = starpu_insert_task(&pipeline_codelet_sum,
			
 
				+		ret = starpu_task_insert(&pipeline_codelet_sum,
			
 
				 				STARPU_R, buffersY[l%K],
			
 
				 				STARPU_CALLBACK_WITH_ARG, (void (*)(void*))sem_post, &sems[l%C],
			
 
				 				0);
			
 
				 		if (ret == -ENODEV) goto enodev;
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task sum");
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert sum");
			
 
				 	}
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
--- a/gcc-plugin/src/tasks.c
+++ b/gcc-plugin/src/tasks.c
@@ -525,7 +525,7 @@ declare_codelet (tree task_decl)
 
				   return cl_decl;
			
 
				 }
			
 
				 
			
 
				-/* Build the body of TASK_DECL, which will call `starpu_insert_task'.  */
			
 
				+/* Build the body of TASK_DECL, which will call `starpu_task_insert'.  */
			
 
				 
			
 
				 void
			
 
				 define_task (tree task_decl)
			
@@ -583,22 +583,22 @@ define_task (tree task_decl)
 
				   /* Introduce a local variable to hold the error code.  */
			
 
				 
			
 
				   tree error_var = build_decl (loc, VAR_DECL,
			
 
				-  			       create_tmp_var_name (".insert_task_error"),
			
 
				+  			       create_tmp_var_name (".task_insert_error"),
			
 
				   			       integer_type_node);
			
 
				   DECL_CONTEXT (error_var) = task_decl;
			
 
				   DECL_ARTIFICIAL (error_var) = true;
			
 
				 
			
 
				   /* Build this:
			
 
				 
			
 
				-       err = starpu_insert_task (...);
			
 
				+       err = starpu_task_insert (...);
			
 
				        if (err != 0)
			
 
				          { printf ...; abort (); }
			
 
				    */
			
 
				 
			
 
				-  static tree insert_task_fn;
			
 
				-  LOOKUP_STARPU_FUNCTION (insert_task_fn, "starpu_insert_task");
			
 
				+  static tree task_insert_fn;
			
 
				+  LOOKUP_STARPU_FUNCTION (task_insert_fn, "starpu_task_insert");
			
 
				 
			
 
				-  tree call = build_call_expr_loc_vec (loc, insert_task_fn, args);
			
 
				+  tree call = build_call_expr_loc_vec (loc, task_insert_fn, args);
			
 
				 
			
 
				   tree assignment = build2 (INIT_EXPR, TREE_TYPE (error_var),
			
 
				   			    error_var, call);
			
--- a/gcc-plugin/tests/base.c
+++ b/gcc-plugin/tests/base.c
@@ -106,7 +106,7 @@ main (int argc, char *argv[])
 
				   unsigned char y = 77;
			
 
				   long y_as_long_int = 77;
			
 
				 
			
 
				-  struct insert_task_argument expected[] =
			
 
				+  struct task_insert_argument expected[] =
			
 
				     {
			
 
				       { STARPU_VALUE, &x, sizeof x },
			
 
				       { STARPU_VALUE, &y, sizeof y },
			
@@ -114,7 +114,7 @@ main (int argc, char *argv[])
 
				       { 0, 0, 0 }
			
 
				     };
			
 
				 
			
 
				-  expected_insert_task_arguments = expected;
			
 
				+  expected_task_insert_arguments = expected;
			
 
				 
			
 
				   /* Invoke the task, which should make sure it gets called with
			
 
				      EXPECTED.  */
			
@@ -135,14 +135,14 @@ main (int argc, char *argv[])
 
				 
			
 
				   assert (tasks_submitted == 9);
			
 
				 
			
 
				-  struct insert_task_argument expected2[] =
			
 
				+  struct task_insert_argument expected2[] =
			
 
				     {
			
 
				       { STARPU_VALUE, &x, sizeof x },
			
 
				       { 0, 0, 0 }
			
 
				     };
			
 
				 
			
 
				   tasks_submitted = 0;
			
 
				-  expected_insert_task_arguments = expected2;
			
 
				+  expected_task_insert_arguments = expected2;
			
 
				 
			
 
				   my_other_task (42);
			
 
				   assert (tasks_submitted == 1);
			
--- a/gcc-plugin/tests/lib-user.c
+++ b/gcc-plugin/tests/lib-user.c
@@ -38,7 +38,7 @@ main (int argc, char *argv[])
 
				   static const char forty_two = 42;
			
 
				   static const int  sizeof_x = sizeof x;
			
 
				 
			
 
				-  struct insert_task_argument expected_pointer_task[] =
			
 
				+  struct task_insert_argument expected_pointer_task[] =
			
 
				     {
			
 
				       { STARPU_VALUE, &forty_two, sizeof forty_two },
			
 
				       { STARPU_R,  x },
			
@@ -47,7 +47,7 @@ main (int argc, char *argv[])
 
				       { 0, 0, 0 }
			
 
				     };
			
 
				 
			
 
				-  expected_insert_task_arguments = expected_pointer_task;
			
 
				+  expected_task_insert_arguments = expected_pointer_task;
			
 
				 
			
 
				   expected_register_arguments.pointer = (void *) x;
			
 
				   expected_register_arguments.elements = sizeof x / sizeof x[0];
			
--- a/gcc-plugin/tests/mocks.h
+++ b/gcc-plugin/tests/mocks.h
@@ -62,7 +62,7 @@ typedef double         cl_double;
 
				 /* Number of tasks submitted.  */
			
 
				 static unsigned int tasks_submitted;
			
 
				 
			
 
				-struct insert_task_argument
			
 
				+struct task_insert_argument
			
 
				 {
			
 
				   /* `STARPU_VALUE', etc. */
			
 
				   int type;
			
@@ -75,18 +75,18 @@ struct insert_task_argument
 
				 };
			
 
				 
			
 
				 /* Pointer to a zero-terminated array listing the expected
			
 
				-   `starpu_insert_task' arguments.  */
			
 
				-const struct insert_task_argument *expected_insert_task_arguments;
			
 
				+   `starpu_task_insert' arguments.  */
			
 
				+const struct task_insert_argument *expected_task_insert_arguments;
			
 
				 
			
 
				 /* Expected targets of the codelets submitted.  */
			
 
				-static int expected_insert_task_targets = STARPU_CPU | STARPU_OPENCL;
			
 
				+static int expected_task_insert_targets = STARPU_CPU | STARPU_OPENCL;
			
 
				 
			
 
				 
			
 
				 int
			
 
				-starpu_insert_task (struct starpu_codelet *cl, ...)
			
 
				+starpu_task_insert (struct starpu_codelet *cl, ...)
			
 
				 {
			
 
				   assert (cl->name != NULL && strlen (cl->name) > 0);
			
 
				-  assert (cl->where == expected_insert_task_targets);
			
 
				+  assert (cl->where == expected_task_insert_targets);
			
 
				 
			
 
				   assert ((cl->where & STARPU_CPU) == 0
			
 
				 	  ? cl->cpu_funcs[0] == NULL
			
@@ -106,8 +106,8 @@ starpu_insert_task (struct starpu_codelet *cl, ...)
 
				 
			
 
				   va_start (args, cl);
			
 
				 
			
 
				-  const struct insert_task_argument *expected;
			
 
				-  for (expected = expected_insert_task_arguments,
			
 
				+  const struct task_insert_argument *expected;
			
 
				+  for (expected = expected_task_insert_arguments,
			
 
				 	 cl_args_offset = 1, scalars = 0, pointers = 0;
			
 
				        expected->type != 0;
			
 
				        expected++)
			
@@ -528,9 +528,9 @@ clSetKernelArg (cl_kernel kernel, cl_uint index, size_t size,
 
				 		const void *value)
			
 
				 {
			
 
				   size_t n;
			
 
				-  const struct insert_task_argument *arg;
			
 
				+  const struct task_insert_argument *arg;
			
 
				 
			
 
				-  for (n = 0, arg = expected_insert_task_arguments;
			
 
				+  for (n = 0, arg = expected_task_insert_arguments;
			
 
				        n < index;
			
 
				        n++, arg++)
			
 
				     assert (arg->pointer != NULL);
			
--- a/gcc-plugin/tests/opencl.c
+++ b/gcc-plugin/tests/opencl.c
@@ -46,15 +46,15 @@ main ()
 
				 #pragma starpu register a
			
 
				 
			
 
				   static int x = 123;
			
 
				-  struct insert_task_argument expected[] =
			
 
				+  struct task_insert_argument expected[] =
			
 
				     {
			
 
				       { STARPU_VALUE, &x, sizeof x },
			
 
				       { STARPU_RW, a },
			
 
				       { 0, 0, 0 }
			
 
				     };
			
 
				 
			
 
				-  expected_insert_task_arguments = expected;
			
 
				-  expected_insert_task_targets = STARPU_OPENCL;
			
 
				+  expected_task_insert_arguments = expected;
			
 
				+  expected_task_insert_targets = STARPU_OPENCL;
			
 
				   size_t y = 8; expected_cl_enqueue_kernel_arguments.global_work_size = &y;
			
 
				 
			
 
				   my_task (123, a);
			
--- a/gcc-plugin/tests/output-pointer.c
+++ b/gcc-plugin/tests/output-pointer.c
@@ -84,14 +84,14 @@ main (int argc, char *argv[])
 
				   expected_register_arguments.element_size = sizeof x[0];
			
 
				   starpu_vector_data_register (&handle, STARPU_MAIN_RAM, (uintptr_t) x, 42, sizeof x[0]);
			
 
				 
			
 
				-  struct insert_task_argument expected[] =
			
 
				+  struct task_insert_argument expected[] =
			
 
				     {
			
 
				       { STARPU_VALUE, &size, sizeof size },
			
 
				       { STARPU_W, x },
			
 
				       { 0, 0, 0 }
			
 
				     };
			
 
				 
			
 
				-  expected_insert_task_arguments = expected;
			
 
				+  expected_task_insert_arguments = expected;
			
 
				 
			
 
				   /* Invoke the task, which makes sure it gets called with EXPECTED.  */
			
 
				   my_pointer_task (size, x);
			
--- a/gcc-plugin/tests/pointers.c
+++ b/gcc-plugin/tests/pointers.c
@@ -92,14 +92,14 @@ main (int argc, char *argv[])
 
				   expected_register_arguments.element_size = sizeof *y;
			
 
				   starpu_vector_data_register (&handle, STARPU_MAIN_RAM, (uintptr_t) y, 1, sizeof *y);
			
 
				 
			
 
				-  struct insert_task_argument expected_pointer_task[] =
			
 
				+  struct task_insert_argument expected_pointer_task[] =
			
 
				     {
			
 
				       { STARPU_R,  x },
			
 
				       { STARPU_RW, y },
			
 
				       { 0, 0, 0 }
			
 
				     };
			
 
				 
			
 
				-  expected_insert_task_arguments = expected_pointer_task;
			
 
				+  expected_task_insert_arguments = expected_pointer_task;
			
 
				 
			
 
				   /* Invoke the task, which should make sure it gets called with
			
 
				      EXPECTED.  */
			
@@ -110,7 +110,7 @@ main (int argc, char *argv[])
 
				 
			
 
				   /* Likewise with `my_mixed_task'.  */
			
 
				 
			
 
				-  struct insert_task_argument expected_mixed_task[] =
			
 
				+  struct task_insert_argument expected_mixed_task[] =
			
 
				     {
			
 
				       { STARPU_RW, x },
			
 
				       { STARPU_VALUE, &z, sizeof z },
			
@@ -118,7 +118,7 @@ main (int argc, char *argv[])
 
				       { 0, 0, 0 }
			
 
				     };
			
 
				 
			
 
				-  expected_insert_task_arguments = expected_mixed_task;
			
 
				+  expected_task_insert_arguments = expected_mixed_task;
			
 
				 
			
 
				   my_mixed_task (x, 0x77, y);
			
 
				 
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -125,9 +125,13 @@ struct starpu_task
 
				 
			
 
				 	void (*callback_func)(void *);
			
 
				 	void *callback_arg;
			
 
				+	/* must StarPU release callback_arg ? - 0 by default */
			
 
				+	unsigned callback_arg_free;
			
 
				 
			
 
				 	void (*prologue_callback_func)(void *);
			
 
				 	void *prologue_callback_arg;
			
 
				+	/* must StarPU release prologue_callback_arg ? - 0 by default */
			
 
				+	unsigned prologue_callback_arg_free;
			
 
				 
			
 
				 	unsigned use_tag;
			
 
				 	starpu_tag_t tag_id;
			
--- a/include/starpu_task_util.h
+++ b/include/starpu_task_util.h
@@ -46,7 +46,9 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 
				 #define STARPU_PROLOGUE_CALLBACK   (13<<16)
			
 
				 #define STARPU_PROLOGUE_CALLBACK_ARG (14<<16)
			
 
				 
			
 
				-int starpu_insert_task(struct starpu_codelet *cl, ...);
			
 
				+struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
			
 
				+int starpu_task_insert(struct starpu_codelet *cl, ...);
			
 
				+#define starpu_insert_task starpu_task_insert
			
 
				 
			
 
				 void starpu_codelet_unpack_args(void *cl_arg, ...);
			
 
				 
			
--- a/include/starpu_util.h
+++ b/include/starpu_util.h
@@ -198,14 +198,6 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 
				 #define STARPU_WMB() STARPU_SYNCHRONIZE()
			
 
				 #endif
			
 
				 
			
 
				-/* This is needed in some places to make valgrind yield to another thread to be
			
 
				- * able to progress.  */
			
 
				-#if defined(__i386__) || defined(__x86_64__)
			
 
				-#define STARPU_UYIELD() __asm__ __volatile("rep; nop")
			
 
				-#else
			
 
				-#define STARPU_UYIELD() ((void)0)
			
 
				-#endif
			
 
				-
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/mpi/examples/complex/mpi_complex.c
+++ b/mpi/examples/complex/mpi_complex.c
@@ -18,10 +18,12 @@
 
				 #include <interface/complex_interface.h>
			
 
				 #include <interface/complex_codelet.h>
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				+
			
 
				 void display_foo_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				 {
			
 
				 	int *foo = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				-	fprintf(stderr, "foo = %d\n", *foo);
			
 
				+	FPRINTF(stderr, "foo = %d\n", *foo);
			
 
				 }
			
 
				 
			
 
				 struct starpu_codelet foo_display =
			
@@ -75,17 +77,17 @@ int main(int argc, char **argv)
 
				 		{
			
 
				 			int *compare_ptr = &compare;
			
 
				 
			
 
				-			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value"), STARPU_R, handle, 0);
			
 
				+			starpu_task_insert(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value"), STARPU_R, handle, 0);
			
 
				 			starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
			
 
				 			starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
			
 
				 
			
 
				-			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value"), STARPU_R, handle2, 0);
			
 
				-			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
			
 
				+			starpu_task_insert(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value"), STARPU_R, handle2, 0);
			
 
				+			starpu_task_insert(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
			
 
				 		}
			
 
				 		else if (rank == 1)
			
 
				 		{
			
 
				 			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
			
 
				-			starpu_insert_task(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value"), STARPU_R, handle, 0);
			
 
				+			starpu_task_insert(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value"), STARPU_R, handle, 0);
			
 
				 			starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
			
 
				 		}
			
 
				 
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky.c
@@ -64,8 +64,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	if (rank == 0)
			
 
				 	{
			
 
				-		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
			
 
				-		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
			
 
				+		FPRINTF(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
			
 
				+		FPRINTF(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
			
 
				 	}
			
 
				 
			
 
				 	return 0;
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
@@ -112,20 +112,20 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
				 		int prio = STARPU_DEFAULT_PRIO;
			
 
				 		if (!noprio) prio = STARPU_MAX_PRIO;
			
 
				 
			
 
				-		starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
			
 
				-				STARPU_PRIORITY, prio,
			
 
				-				STARPU_RW, data_handles[k][k],
			
 
				-				0);
			
 
				+		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
			
 
				+				       STARPU_PRIORITY, prio,
			
 
				+				       STARPU_RW, data_handles[k][k],
			
 
				+				       0);
			
 
				 
			
 
				 		for (j = k+1; j<nblocks; j++)
			
 
				 		{
			
 
				 			prio = STARPU_DEFAULT_PRIO;
			
 
				 			if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
			
 
				-			starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
			
 
				-					STARPU_PRIORITY, prio,
			
 
				-					STARPU_R, data_handles[k][k],
			
 
				-					STARPU_RW, data_handles[k][j],
			
 
				-					0);
			
 
				+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
			
 
				+					       STARPU_PRIORITY, prio,
			
 
				+					       STARPU_R, data_handles[k][k],
			
 
				+					       STARPU_RW, data_handles[k][j],
			
 
				+					       0);
			
 
				 
			
 
				 			for (i = k+1; i<nblocks; i++)
			
 
				 			{
			
@@ -133,12 +133,12 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
				 				{
			
 
				 					prio = STARPU_DEFAULT_PRIO;
			
 
				 					if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
			
 
				-					starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
			
 
				-							STARPU_PRIORITY, prio,
			
 
				-							STARPU_R, data_handles[k][i],
			
 
				-							STARPU_R, data_handles[k][j],
			
 
				-							STARPU_RW, data_handles[i][j],
			
 
				-							0);
			
 
				+					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
			
 
				+							       STARPU_PRIORITY, prio,
			
 
				+							       STARPU_R, data_handles[k][i],
			
 
				+							       STARPU_R, data_handles[k][j],
			
 
				+							       STARPU_RW, data_handles[i][j],
			
 
				+							       0);
			
 
				 				}
			
 
				 			}
			
 
				 		}
			
@@ -186,7 +186,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
			
 
				+	FPRINTF(stderr, "[%d] compute explicit LLt ...\n", rank);
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
@@ -203,7 +203,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 
				 	SSYRK("L", "N", size, size, 1.0f,
			
 
				 			rmat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				-	fprintf(stderr, "[%d] comparing results ...\n", rank);
			
 
				+	FPRINTF(stderr, "[%d] comparing results ...\n", rank);
			
 
				 	if (display)
			
 
				 	{
			
 
				 		for (j = 0; j < size; j++)
			
@@ -241,7 +241,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 
				 							float err = abs(test_mat[j +i*size] - orig);
			
 
				 							if (err > 0.00001)
			
 
				 							{
			
 
				-								fprintf(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
			
 
				+								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
			
 
				 								*correctness = 0;
			
 
				 								*flops = 0;
			
 
				 								break;
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c
@@ -22,6 +22,8 @@
 
				 #include "mpi_decomposition_matrix.h"
			
 
				 #include "mpi_decomposition_params.h"
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				+
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	/* create a simple definite positive symetric matrix example
			
@@ -56,8 +58,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	if (rank == 0)
			
 
				 	{
			
 
				-		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
			
 
				-		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
			
 
				+		FPRINTF(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
			
 
				+		FPRINTF(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
			
 
				 	}
			
 
				 
			
 
				 	return 0;
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_models.h
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_models.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,6 +20,8 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				+
			
 
				 extern struct starpu_perfmodel chol_model_11;
			
 
				 extern struct starpu_perfmodel chol_model_21;
			
 
				 extern struct starpu_perfmodel chol_model_22;
			
--- a/mpi/examples/mpi_lu/pxlu_implicit.c
+++ b/mpi/examples/mpi_lu/pxlu_implicit.c
@@ -39,15 +39,15 @@ struct callback_arg {
 
				 
			
 
				 static void create_task_11(unsigned k)
			
 
				 {
			
 
				-	starpu_mpi_insert_task(MPI_COMM_WORLD,
			
 
				-			&STARPU_PLU(cl11),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_RW, STARPU_PLU(get_block_handle)(k, k),
			
 
				-			STARPU_PRIORITY, !no_prio ?
			
 
				-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				-			0);
			
 
				+	starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				+			       &STARPU_PLU(cl11),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_RW, STARPU_PLU(get_block_handle)(k, k),
			
 
				+			       STARPU_PRIORITY, !no_prio ?
			
 
				+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				+			       0);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -57,17 +57,17 @@ static void create_task_11(unsigned k)
 
				 static void create_task_12(unsigned k, unsigned j)
			
 
				 {
			
 
				 #warning temporary fix 
			
 
				-	starpu_mpi_insert_task(MPI_COMM_WORLD,
			
 
				-			//&STARPU_PLU(cl12),
			
 
				-			&STARPU_PLU(cl21),
			
 
				-			STARPU_VALUE, &j, sizeof(j),
			
 
				-			STARPU_VALUE, &j, sizeof(j),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_R, STARPU_PLU(get_block_handle)(k, k),
			
 
				-			STARPU_RW, STARPU_PLU(get_block_handle)(k, j),
			
 
				-			STARPU_PRIORITY, !no_prio && (j == k+1) ?
			
 
				-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				-			0);
			
 
				+	starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				+			       //&STARPU_PLU(cl12),
			
 
				+			       &STARPU_PLU(cl21),
			
 
				+			       STARPU_VALUE, &j, sizeof(j),
			
 
				+			       STARPU_VALUE, &j, sizeof(j),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_R, STARPU_PLU(get_block_handle)(k, k),
			
 
				+			       STARPU_RW, STARPU_PLU(get_block_handle)(k, j),
			
 
				+			       STARPU_PRIORITY, !no_prio && (j == k+1) ?
			
 
				+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				+			       0);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -77,17 +77,17 @@ static void create_task_12(unsigned k, unsigned j)
 
				 static void create_task_21(unsigned k, unsigned i)
			
 
				 {
			
 
				 #warning temporary fix 
			
 
				-	starpu_mpi_insert_task(MPI_COMM_WORLD,
			
 
				-			//&STARPU_PLU(cl21),
			
 
				-			&STARPU_PLU(cl12),
			
 
				-			STARPU_VALUE, &i, sizeof(i),
			
 
				-			STARPU_VALUE, &i, sizeof(i),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_R, STARPU_PLU(get_block_handle)(k, k),
			
 
				-			STARPU_RW, STARPU_PLU(get_block_handle)(i, k),
			
 
				-			STARPU_PRIORITY, !no_prio && (i == k+1) ?
			
 
				-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				-			0);
			
 
				+	starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				+			       //&STARPU_PLU(cl21),
			
 
				+			       &STARPU_PLU(cl12),
			
 
				+			       STARPU_VALUE, &i, sizeof(i),
			
 
				+			       STARPU_VALUE, &i, sizeof(i),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_R, STARPU_PLU(get_block_handle)(k, k),
			
 
				+			       STARPU_RW, STARPU_PLU(get_block_handle)(i, k),
			
 
				+			       STARPU_PRIORITY, !no_prio && (i == k+1) ?
			
 
				+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				+			       0);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -96,17 +96,17 @@ static void create_task_21(unsigned k, unsigned i)
 
				 
			
 
				 static void create_task_22(unsigned k, unsigned i, unsigned j)
			
 
				 {
			
 
				-	starpu_mpi_insert_task(MPI_COMM_WORLD,
			
 
				-			&STARPU_PLU(cl22),
			
 
				-			STARPU_VALUE, &i, sizeof(i),
			
 
				-			STARPU_VALUE, &j, sizeof(j),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_R, STARPU_PLU(get_block_handle)(k, j),
			
 
				-			STARPU_R, STARPU_PLU(get_block_handle)(i, k),
			
 
				-			STARPU_RW, STARPU_PLU(get_block_handle)(i, j),
			
 
				-			STARPU_PRIORITY, !no_prio && (i == k + 1) && (j == k +1) ?
			
 
				-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				-			0);
			
 
				+	starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				+			       &STARPU_PLU(cl22),
			
 
				+			       STARPU_VALUE, &i, sizeof(i),
			
 
				+			       STARPU_VALUE, &j, sizeof(j),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_R, STARPU_PLU(get_block_handle)(k, j),
			
 
				+			       STARPU_R, STARPU_PLU(get_block_handle)(i, k),
			
 
				+			       STARPU_RW, STARPU_PLU(get_block_handle)(i, j),
			
 
				+			       STARPU_PRIORITY, !no_prio && (i == k + 1) && (j == k +1) ?
			
 
				+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				+			       0);
			
 
				 }
			
 
				 
			
 
				 /*
			
--- a/mpi/examples/stencil/stencil5.c
+++ b/mpi/examples/stencil/stencil5.c
@@ -17,6 +17,8 @@
 
				 #include <starpu_mpi.h>
			
 
				 #include <math.h>
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				+
			
 
				 void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				 {
			
 
				 	unsigned *xy = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
@@ -25,7 +27,7 @@ void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 
				 	unsigned *xym1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[3]);
			
 
				 	unsigned *xyp1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[4]);
			
 
				 
			
 
				-	//fprintf(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
			
 
				+	//FPRINTF(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
			
 
				 	*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
			
 
				 }
			
 
				 
			
@@ -107,14 +109,14 @@ int main(int argc, char **argv)
 
				 			int mpi_rank = my_distrib(x, y, size);
			
 
				 			if (mpi_rank == my_rank)
			
 
				 			{
			
 
				-				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
			
 
				+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
			
 
				 				starpu_variable_data_register(&data_handles[x][y], STARPU_MAIN_RAM, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
			
 
				 			}
			
 
				 			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
			
 
				 				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
			
 
				 			{
			
 
				 				/* I don't own that index, but will need it for my computations */
			
 
				-				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
			
 
				+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
			
 
				 				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
			
 
				 			}
			
 
				 			else
			
@@ -136,14 +138,14 @@ int main(int argc, char **argv)
 
				 		{
			
 
				 			for (y = 1; y < Y-1; y++)
			
 
				 			{
			
 
				-				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
			
 
				+				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
			
 
				 						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
			
 
				 						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
			
 
				 						       0);
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				-	fprintf(stderr, "Waiting ...\n");
			
 
				+	FPRINTF(stderr, "Waiting ...\n");
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 	for(x = 0; x < X; x++)
			
@@ -162,15 +164,15 @@ int main(int argc, char **argv)
 
				 
			
 
				 	if (display)
			
 
				 	{
			
 
				-		fprintf(stdout, "[%d] mean=%d\n", my_rank, mean);
			
 
				+		FPRINTF(stdout, "[%d] mean=%d\n", my_rank, mean);
			
 
				 		for(x = 0; x < X; x++)
			
 
				 		{
			
 
				-			fprintf(stdout, "[%d] ", my_rank);
			
 
				+			FPRINTF(stdout, "[%d] ", my_rank);
			
 
				 			for (y = 0; y < Y; y++)
			
 
				 			{
			
 
				-				fprintf(stdout, "%3u ", matrix[x][y]);
			
 
				+				FPRINTF(stdout, "%3u ", matrix[x][y]);
			
 
				 			}
			
 
				-			fprintf(stdout, "\n");
			
 
				+			FPRINTF(stdout, "\n");
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/mpi/include/starpu_mpi.h
+++ b/mpi/include/starpu_mpi.h
@@ -47,7 +47,8 @@ int starpu_mpi_initialize(void) STARPU_DEPRECATED;
 
				 int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;
			
 
				 int starpu_mpi_shutdown(void);
			
 
				 
			
 
				-int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
			
 
				+int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...);
			
 
				+#define starpu_mpi_insert_task starpu_mpi_task_insert
			
 
				 void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);
			
 
				 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
			
 
				 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
			
--- a/mpi/src/Makefile.am
+++ b/mpi/src/Makefile.am
@@ -36,14 +36,14 @@ noinst_HEADERS =					\
 
				 	starpu_mpi_private.h				\
			
 
				 	starpu_mpi_fxt.h				\
			
 
				 	starpu_mpi_stats.h				\
			
 
				-	starpu_mpi_insert_task.h			\
			
 
				+	starpu_mpi_task_insert.h			\
			
 
				 	starpu_mpi_datatype.h
			
 
				 
			
 
				 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
			
 
				 	starpu_mpi.c					\
			
 
				 	starpu_mpi_helper.c				\
			
 
				 	starpu_mpi_datatype.c				\
			
 
				-	starpu_mpi_insert_task.c			\
			
 
				+	starpu_mpi_task_insert.c			\
			
 
				 	starpu_mpi_collective.c				\
			
 
				 	starpu_mpi_stats.c				\
			
 
				 	starpu_mpi_private.c
			
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -21,7 +21,7 @@
 
				 #include <starpu_mpi_private.h>
			
 
				 #include <starpu_profiling.h>
			
 
				 #include <starpu_mpi_stats.h>
			
 
				-#include <starpu_mpi_insert_task.h>
			
 
				+#include <starpu_mpi_task_insert.h>
			
 
				 #include <common/config.h>
			
 
				 #include <common/thread.h>
			
 
				 
			
@@ -38,7 +38,7 @@ static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t dat
 
				 							int source, int mpi_tag, MPI_Comm comm,
			
 
				 							unsigned detached, void (*callback)(void *), void *arg,
			
 
				 							int sequential_consistency, int is_internal_req,
			
 
				-							ssize_t psize);
			
 
				+							ssize_t count);
			
 
				 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
			
 
				 
			
 
				 /* The list of requests that have been newly submitted by the application */
			
@@ -62,31 +62,43 @@ static int posted_requests = 0, newer_requests, barrier_running = 0;
 
				 
			
 
				 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
			
 
				 
			
 
				-struct _starpu_mpi_copy_handle
			
 
				+LIST_TYPE(_starpu_mpi_copy_handle,
			
 
				+	  starpu_data_handle_t handle;
			
 
				+	  struct _starpu_mpi_envelope *env;
			
 
				+	  struct _starpu_mpi_req *req;
			
 
				+	  void *buffer;
			
 
				+	  int mpi_tag;
			
 
				+	  int source;
			
 
				+	  int req_ready;
			
 
				+	  starpu_pthread_mutex_t req_mutex;
			
 
				+	  starpu_pthread_cond_t req_cond;
			
 
				+);
			
 
				+
			
 
				+struct _starpu_mpi_copy_handle_hashlist
			
 
				 {
			
 
				-	starpu_data_handle_t handle;
			
 
				-	struct _starpu_mpi_envelope *env;
			
 
				-	int mpi_tag;
			
 
				+	struct _starpu_mpi_copy_handle_list *list;
			
 
				 	UT_hash_handle hh;
			
 
				-	struct _starpu_mpi_req *req;
			
 
				+	int mpi_tag;
			
 
				 };
			
 
				 
			
 
				- /********************************************************/
			
 
				- /*                                                      */
			
 
				- /*  Hashmap's requests functionalities                  */
			
 
				- /*                                                      */
			
 
				- /********************************************************/
			
 
				+/********************************************************/
			
 
				+/*                                                      */
			
 
				+/*  Hashmap's requests functionalities                  */
			
 
				+/*                                                      */
			
 
				+/********************************************************/
			
 
				 
			
 
				 /** stores application requests for which data have not been received yet */
			
 
				-static struct _starpu_mpi_req *_starpu_mpi_app_req_hashmap = NULL;
			
 
				+static struct _starpu_mpi_req **_starpu_mpi_app_req_hashmap = NULL;
			
 
				+static int _starpu_mpi_app_req_hashmap_count = 0;
			
 
				 /** stores data which have been received by MPI but have not been requested by the application */
			
 
				-static struct _starpu_mpi_copy_handle *_starpu_mpi_copy_handle_hashmap = NULL;
			
 
				+static struct _starpu_mpi_copy_handle_hashlist **_starpu_mpi_copy_handle_hashmap = NULL;
			
 
				+static int _starpu_mpi_copy_handle_hashmap_count = 0;
			
 
				 
			
 
				-static struct _starpu_mpi_req* find_app_req(int mpi_tag)
			
 
				+static struct _starpu_mpi_req* find_app_req(int mpi_tag, int source)
			
 
				 {
			
 
				 	struct _starpu_mpi_req* req;
			
 
				 
			
 
				-	HASH_FIND_INT(_starpu_mpi_app_req_hashmap, &mpi_tag, req);
			
 
				+	HASH_FIND_INT(_starpu_mpi_app_req_hashmap[source], &mpi_tag, req);
			
 
				 
			
 
				 	return req;
			
 
				 }
			
@@ -95,24 +107,25 @@ static void add_app_req(struct _starpu_mpi_req *req)
 
				 {
			
 
				 	struct _starpu_mpi_req *test_req;
			
 
				 
			
 
				-	test_req = find_app_req(req->mpi_tag);
			
 
				+	test_req = find_app_req(req->mpi_tag, req->srcdst);
			
 
				 
			
 
				 	if (test_req == NULL)
			
 
				 	{
			
 
				-		HASH_ADD_INT(_starpu_mpi_app_req_hashmap, mpi_tag, req);
			
 
				-		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap. \n", req, req->mpi_tag);
			
 
				+		HASH_ADD_INT(_starpu_mpi_app_req_hashmap[req->srcdst], mpi_tag, req);
			
 
				+		_starpu_mpi_app_req_hashmap_count ++;
			
 
				+		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap. \n", req, req->mpi_tag);
			
 
				+		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				 		int seq_const = starpu_data_get_sequential_consistency_flag(req->data_handle);
			
 
				 		if (seq_const &&  req->sequential_consistency)
			
 
				 		{
			
 
				-			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap, while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, test_req);
			
 
				+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, req->srcdst, test_req);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				-			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap, while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, test_req);
			
 
				+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap[%d], while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, req->srcdst, test_req);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -121,61 +134,115 @@ static void delete_app_req(struct _starpu_mpi_req *req)
 
				 {
			
 
				 	struct _starpu_mpi_req *test_req;
			
 
				 
			
 
				-	test_req = find_app_req(req->mpi_tag);
			
 
				+	test_req = find_app_req(req->mpi_tag, req->srcdst);
			
 
				 
			
 
				 	if (test_req != NULL)
			
 
				 	{
			
 
				-		HASH_DEL(_starpu_mpi_app_req_hashmap, req);
			
 
				-		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap. \n", req, req->mpi_tag);
			
 
				+		HASH_DEL(_starpu_mpi_app_req_hashmap[req->srcdst], req);
			
 
				+		_starpu_mpi_app_req_hashmap_count --;
			
 
				+		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap. \n", req, req->mpi_tag);
			
 
				+		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap[%d]\n", req, req->mpi_tag, req->srcdst);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static struct _starpu_mpi_copy_handle* find_chandle(int mpi_tag)
			
 
				+static void _starpu_mpi_copy_handle_display_hash(int source, int tag)
			
 
				 {
			
 
				-	struct _starpu_mpi_copy_handle* chandle;
			
 
				-
			
 
				-	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap, &mpi_tag, chandle);
			
 
				+	struct _starpu_mpi_copy_handle_hashlist *hashlist;
			
 
				+	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[source], &tag, hashlist);
			
 
				 
			
 
				-	return chandle;
			
 
				+	if (hashlist == NULL)
			
 
				+	{
			
 
				+		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d does not exist\n", source, tag);
			
 
				+	}
			
 
				+	else if (_starpu_mpi_copy_handle_list_empty(hashlist->list))
			
 
				+	{
			
 
				+		_STARPU_MPI_DEBUG(60, "Hashlist for source %d and tag %d is empty\n", source, tag);
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		struct _starpu_mpi_copy_handle *cur;
			
 
				+		for (cur = _starpu_mpi_copy_handle_list_begin(hashlist->list) ;
			
 
				+		     cur != _starpu_mpi_copy_handle_list_end(hashlist->list);
			
 
				+		     cur = _starpu_mpi_copy_handle_list_next(cur))
			
 
				+		{
			
 
				+			_STARPU_MPI_DEBUG(60, "Element for source %d and tag %d: %p\n", source, tag, cur);
			
 
				+		}
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-static void add_chandle(struct _starpu_mpi_copy_handle *chandle)
			
 
				+static struct _starpu_mpi_copy_handle *pop_chandle(int mpi_tag, int source, int delete)
			
 
				 {
			
 
				-	struct _starpu_mpi_copy_handle *test_chandle;
			
 
				+	struct _starpu_mpi_copy_handle_hashlist *hashlist;
			
 
				+	struct _starpu_mpi_copy_handle *chandle;
			
 
				 
			
 
				-	test_chandle = find_chandle(chandle->mpi_tag);
			
 
				-
			
 
				-	if (test_chandle == NULL)
			
 
				+	_STARPU_MPI_DEBUG(60, "Looking for chandle with tag %d in the hashmap[%d]\n", mpi_tag, source);
			
 
				+	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[source], &mpi_tag, hashlist);
			
 
				+	if (hashlist == NULL)
			
 
				 	{
			
 
				-		HASH_ADD_INT(_starpu_mpi_copy_handle_hashmap, mpi_tag, chandle);
			
 
				-		_STARPU_MPI_DEBUG(3, "Adding copied handle %p with tag %d in the hashmap. \n", chandle, chandle->mpi_tag);
			
 
				+		chandle = NULL;
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		_STARPU_MPI_DEBUG(3, "Error add_chandle : copied handle %p with tag %d already in the hashmap. \n", chandle, chandle->mpi_tag);
			
 
				-		STARPU_ASSERT(test_chandle != NULL);
			
 
				+		if (_starpu_mpi_copy_handle_list_empty(hashlist->list))
			
 
				+		{
			
 
				+			chandle = NULL;
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			if (delete == 1)
			
 
				+			{
			
 
				+				chandle = _starpu_mpi_copy_handle_list_pop_front(hashlist->list);
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				chandle = _starpu_mpi_copy_handle_list_front(hashlist->list);
			
 
				+			}
			
 
				+		}
			
 
				 	}
			
 
				+	_STARPU_MPI_DEBUG(60, "Found chandle %p with tag %d in the hashmap[%d]\n", chandle, mpi_tag, source);
			
 
				+	return chandle;
			
 
				 }
			
 
				 
			
 
				-static void delete_chandle(struct _starpu_mpi_copy_handle *chandle)
			
 
				+static struct _starpu_mpi_copy_handle *find_chandle(int mpi_tag, int source)
			
 
				 {
			
 
				-	struct _starpu_mpi_copy_handle *test_chandle;
			
 
				+	return pop_chandle(mpi_tag, source, 0);
			
 
				+}
			
 
				 
			
 
				-	test_chandle = find_chandle(chandle->mpi_tag);
			
 
				+static void add_chandle(struct _starpu_mpi_copy_handle *chandle)
			
 
				+{
			
 
				+	_STARPU_MPI_DEBUG(60, "Trying to add chandle %p with tag %d in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
			
 
				 
			
 
				-	if (test_chandle != NULL)
			
 
				+	struct _starpu_mpi_copy_handle_hashlist *hashlist;
			
 
				+	HASH_FIND_INT(_starpu_mpi_copy_handle_hashmap[chandle->source], &chandle->mpi_tag, hashlist);
			
 
				+	if (hashlist == NULL)
			
 
				 	{
			
 
				-		HASH_DEL(_starpu_mpi_copy_handle_hashmap, chandle);
			
 
				-		_STARPU_MPI_DEBUG(3, "Deleting copied handle %p with tag %d from the hashmap. \n", chandle, chandle->mpi_tag);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		_STARPU_MPI_DEBUG(3, "Warning delete_chandle : copied handle %p with tag %d isn't in the hashmap. \n", chandle, chandle->mpi_tag);
			
 
				+		hashlist = malloc(sizeof(struct _starpu_mpi_copy_handle_hashlist));
			
 
				+		hashlist->list = _starpu_mpi_copy_handle_list_new();
			
 
				+		hashlist->mpi_tag = chandle->mpi_tag;
			
 
				+		HASH_ADD_INT(_starpu_mpi_copy_handle_hashmap[chandle->source], mpi_tag, hashlist);
			
 
				 	}
			
 
				+	_starpu_mpi_copy_handle_list_push_back(hashlist->list, chandle);
			
 
				+	_starpu_mpi_copy_handle_hashmap_count ++;
			
 
				+#ifdef STARPU_VERBOSE
			
 
				+	_starpu_mpi_copy_handle_display_hash(chandle->source, chandle->mpi_tag);
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+static void delete_chandle(struct _starpu_mpi_copy_handle *chandle)
			
 
				+{
			
 
				+	_STARPU_MPI_DEBUG(60, "Trying to delete chandle %p with tag %d in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
			
 
				+	struct _starpu_mpi_copy_handle *found = pop_chandle(chandle->mpi_tag, chandle->source, 1);
			
 
				+
			
 
				+	STARPU_ASSERT_MSG(found == chandle,
			
 
				+			  "Error delete_chandle : chandle %p with tag %d is NOT in the hashmap[%d]\n", chandle, chandle->mpi_tag, chandle->source);
			
 
				+
			
 
				+	_starpu_mpi_copy_handle_hashmap_count --;
			
 
				+#ifdef STARPU_VERBOSE
			
 
				+	_starpu_mpi_copy_handle_display_hash(chandle->source, chandle->mpi_tag);
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				 static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
			
@@ -183,7 +250,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 
				 	/* Initialize the request structure */
			
 
				 	req->data_handle = NULL;
			
 
				 
			
 
				-	req->datatype = NULL;
			
 
				+	req->datatype = 0;
			
 
				 	req->ptr = NULL;
			
 
				 	req->count = -1;
			
 
				 	req->user_datatype = -1;
			
@@ -195,7 +262,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 
				 	req->func = NULL;
			
 
				 
			
 
				 	req->status = NULL;
			
 
				-	req->request = NULL;
			
 
				+	req->request = 0;
			
 
				 	req->flag = NULL;
			
 
				 
			
 
				 	req->ret = -1;
			
@@ -216,7 +283,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 
				 	req->callback = NULL;
			
 
				 	req->callback_arg = NULL;
			
 
				 
			
 
				-	req->size_req = NULL;
			
 
				+	req->size_req = 0;
			
 
				 	req->internal_req = NULL;
			
 
				 	req->is_internal_req = 0;
			
 
				 	req->envelope = NULL;
			
@@ -236,7 +303,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 
				 							       enum starpu_data_access_mode mode,
			
 
				 							       int sequential_consistency,
			
 
				 							       int is_internal_req,
			
 
				-							       ssize_t psize)
			
 
				+							       ssize_t count)
			
 
				 {
			
 
				 
			
 
				 	 _STARPU_MPI_LOG_IN();
			
@@ -258,7 +325,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 
				 	 req->func = func;
			
 
				 	 req->sequential_consistency = sequential_consistency;
			
 
				 	 req->is_internal_req = is_internal_req;
			
 
				-	 req->count = psize;
			
 
				+	 req->count = count;
			
 
				 
			
 
				 	 /* Asynchronously request StarPU to fetch the data in main memory: when
			
 
				 	  * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
			
@@ -312,11 +379,12 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 
				 
			
 
				 	if (req->user_datatype == 0)
			
 
				 	{
			
 
				+		int size;
			
 
				 		req->count = 1;
			
 
				 		req->ptr = starpu_data_get_local_ptr(req->data_handle);
			
 
				 
			
 
				-		req->envelope->psize = (ssize_t)req->count;
			
 
				-
			
 
				+		MPI_Type_size(req->datatype, &size);
			
 
				+		req->envelope->size = (ssize_t)req->count * size;
			
 
				 		_STARPU_MPI_DEBUG(1, "Post MPI isend count (%ld) datatype_size %ld request to %d with tag %d\n",req->count,starpu_data_get_size(req->data_handle),req->srcdst, _starpu_mpi_tag);
			
 
				 		MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
			
 
				 	}
			
@@ -325,30 +393,30 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 
				 		int ret;
			
 
				 
			
 
				  		// Do not pack the data, just try to find out the size
			
 
				-		starpu_data_pack(req->data_handle, NULL, &(req->envelope->psize));
			
 
				+		starpu_data_pack(req->data_handle, NULL, &(req->envelope->size));
			
 
				 
			
 
				-		if (req->envelope->psize != -1)
			
 
				+		if (req->envelope->size != -1)
			
 
				  		{
			
 
				  			// We already know the size of the data, let's send it to overlap with the packing of the data
			
 
				-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (first call to pack)\n", req->envelope->psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
			
 
				-			req->count = req->envelope->psize;
			
 
				+			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (first call to pack)\n", req->envelope->size, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
			
 
				+			req->count = req->envelope->size;
			
 
				 			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
			
 
				 			STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %d", ret);
			
 
				  		}
			
 
				 
			
 
				  		// Pack the data
			
 
				  		starpu_data_pack(req->data_handle, &req->ptr, &req->count);
			
 
				-		if (req->envelope->psize == -1)
			
 
				+		if (req->envelope->size == -1)
			
 
				  		{
			
 
				  			// We know the size now, let's send it
			
 
				-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (second call to pack)\n", req->envelope->psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
			
 
				+			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (second call to pack)\n", req->envelope->size, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
			
 
				 			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
			
 
				 			STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %d", ret);
			
 
				  		}
			
 
				  		else
			
 
				  		{
			
 
				  			// We check the size returned with the 2 calls to pack is the same
			
 
				-			STARPU_ASSERT_MSG(req->count == req->envelope->psize, "Calls to pack_data returned different sizes %ld != %ld", req->count, req->envelope->psize);
			
 
				+			STARPU_ASSERT_MSG(req->count == req->envelope->size, "Calls to pack_data returned different sizes %ld != %ld", req->count, req->envelope->size);
			
 
				  		}
			
 
				 		// We can send the data now
			
 
				 	}
			
@@ -415,7 +483,7 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
				 
			
 
				 	STARPU_ASSERT_MSG(req->ptr, "Invalid pointer to receive data");
			
 
				 
			
 
				-	_STARPU_MPI_DEBUG(2, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
			
 
				+	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
			
 
				 
			
 
				 	TRACE_MPI_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
			
 
				 
			
@@ -435,9 +503,9 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
				 	_STARPU_MPI_LOG_OUT();
			
 
				 }
			
 
				 
			
 
				-static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, ssize_t psize)
			
 
				+static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, ssize_t count)
			
 
				 {
			
 
				-	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W, sequential_consistency, is_internal_req, psize);
			
 
				+	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W, sequential_consistency, is_internal_req, count);
			
 
				 }
			
 
				 
			
 
				 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
			
@@ -445,12 +513,12 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 
				 	_STARPU_MPI_LOG_IN();
			
 
				 	STARPU_ASSERT_MSG(public_req, "starpu_mpi_irecv needs a valid starpu_mpi_req");
			
 
				 
			
 
				-	// We check if a tag is defined for the data handle, if not,
			
 
				-	// we define the one given for the communication.
			
 
				-	// A tag is necessary for the internal mpi engine.
			
 
				-	int tag = starpu_data_get_tag(data_handle);
			
 
				-	if (tag == -1)
			
 
				-		starpu_data_set_tag(data_handle, mpi_tag);
			
 
				+//	// We check if a tag is defined for the data handle, if not,
			
 
				+//	// we define the one given for the communication.
			
 
				+//	// A tag is necessary for the internal mpi engine.
			
 
				+//	int tag = starpu_data_get_tag(data_handle);
			
 
				+//	if (tag == -1)
			
 
				+//		starpu_data_set_tag(data_handle, mpi_tag);
			
 
				 
			
 
				 	struct _starpu_mpi_req *req;
			
 
				 	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, NULL, NULL, 1, 0, 0);
			
@@ -466,12 +534,12 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int
 
				 {
			
 
				 	_STARPU_MPI_LOG_IN();
			
 
				 
			
 
				-	// We check if a tag is defined for the data handle, if not,
			
 
				-	// we define the one given for the communication.
			
 
				-	// A tag is necessary for the internal mpi engine.
			
 
				-	int tag = starpu_data_get_tag(data_handle);
			
 
				-	if (tag == -1)
			
 
				-		starpu_data_set_tag(data_handle, mpi_tag);
			
 
				+//	// We check if a tag is defined for the data handle, if not,
			
 
				+//	// we define the one given for the communication.
			
 
				+//	// A tag is necessary for the internal mpi engine.
			
 
				+//	int tag = starpu_data_get_tag(data_handle);
			
 
				+//	if (tag == -1)
			
 
				+//		starpu_data_set_tag(data_handle, mpi_tag);
			
 
				 
			
 
				 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg, 1, 0, 0);
			
 
				 	_STARPU_MPI_LOG_OUT();
			
@@ -482,6 +550,13 @@ int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_h
 
				 {
			
 
				 	_STARPU_MPI_LOG_IN();
			
 
				 
			
 
				+//	// We check if a tag is defined for the data handle, if not,
			
 
				+//	// we define the one given for the communication.
			
 
				+//	// A tag is necessary for the internal mpi engine.
			
 
				+//	int tag = starpu_data_get_tag(data_handle);
			
 
				+//	if (tag == -1)
			
 
				+//		starpu_data_set_tag(data_handle, mpi_tag);
			
 
				+
			
 
				 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg, sequential_consistency, 0, 0);
			
 
				 
			
 
				 	_STARPU_MPI_LOG_OUT();
			
@@ -493,12 +568,12 @@ int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, M
 
				 	starpu_mpi_req req;
			
 
				 	_STARPU_MPI_LOG_IN();
			
 
				 
			
 
				-	// We check if a tag is defined for the data handle, if not,
			
 
				-	// we define the one given for the communication.
			
 
				-	// A tag is necessary for the internal mpi engine.
			
 
				-	int tag = starpu_data_get_tag(data_handle);
			
 
				-	if (tag == -1)
			
 
				-		starpu_data_set_tag(data_handle, mpi_tag);
			
 
				+//	// We check if a tag is defined for the data handle, if not,
			
 
				+//	// we define the one given for the communication.
			
 
				+//	// A tag is necessary for the internal mpi engine.
			
 
				+//	int tag = starpu_data_get_tag(data_handle);
			
 
				+//	if (tag == -1)
			
 
				+//		starpu_data_set_tag(data_handle, mpi_tag);
			
 
				 
			
 
				 	starpu_mpi_irecv(data_handle, &req, source, mpi_tag, comm);
			
 
				 	starpu_mpi_wait(&req, status);
			
@@ -779,7 +854,8 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 
				 
			
 
				 	if (req->internal_req)
			
 
				 	{
			
 
				-		struct _starpu_mpi_copy_handle *chandle = find_chandle(starpu_data_get_tag(req->data_handle));
			
 
				+		struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag, req->srcdst);
			
 
				+		STARPU_ASSERT_MSG(chandle, "Could not find a copy data handle with the tag %d and the node %d\n", req->mpi_tag, req->srcdst);
			
 
				 		_STARPU_MPI_DEBUG(3, "Handling deleting of copy_handle structure from the hashmap..\n");
			
 
				 		delete_chandle(chandle);
			
 
				 		free(chandle);
			
@@ -839,6 +915,7 @@ struct _starpu_mpi_copy_cb_args
 
				 	starpu_data_handle_t data_handle;
			
 
				 	starpu_data_handle_t copy_handle;
			
 
				 	struct _starpu_mpi_req *req;
			
 
				+	void *buffer;
			
 
				 };
			
 
				 
			
 
				 static void _starpu_mpi_copy_cb(void* arg)
			
@@ -850,19 +927,30 @@ static void _starpu_mpi_copy_cb(void* arg)
 
				 	args->req->request = args->req->internal_req->request;
			
 
				 	args->req->submitted = 1;
			
 
				 
			
 
				-	struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->copy_handle);
			
 
				-	void* itf_src = starpu_data_get_interface_on_node(args->copy_handle,0);
			
 
				-	void* itf_dst = starpu_data_get_interface_on_node(args->data_handle,0);
			
 
				-
			
 
				-	if (!itf->copy_methods->ram_to_ram)
			
 
				+	if (args->buffer)
			
 
				 	{
			
 
				-		_STARPU_MPI_DEBUG(3, "Initiating any_to_any copy..\n");
			
 
				-		itf->copy_methods->any_to_any(itf_src, 0, itf_dst, 0, NULL);
			
 
				+		/* Data has been received as a raw memory, it has to be unpacked */
			
 
				+		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->copy_handle);
			
 
				+		struct starpu_data_interface_ops *itf_dst = starpu_data_get_interface_ops(args->data_handle);
			
 
				+		itf_dst->unpack_data(args->data_handle, 0, args->buffer, itf_src->get_size(args->copy_handle));
			
 
				+		free(args->buffer);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		_STARPU_MPI_DEBUG(3, "Initiating ram_to_ram copy..\n");
			
 
				-		itf->copy_methods->ram_to_ram(itf_src, 0, itf_dst, 0);
			
 
				+		struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->copy_handle);
			
 
				+		void* itf_src = starpu_data_get_interface_on_node(args->copy_handle,0);
			
 
				+		void* itf_dst = starpu_data_get_interface_on_node(args->data_handle,0);
			
 
				+
			
 
				+		if (!itf->copy_methods->ram_to_ram)
			
 
				+		{
			
 
				+			_STARPU_MPI_DEBUG(3, "Initiating any_to_any copy..\n");
			
 
				+			itf->copy_methods->any_to_any(itf_src, 0, itf_dst, 0, NULL);
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			_STARPU_MPI_DEBUG(3, "Initiating ram_to_ram copy..\n");
			
 
				+			itf->copy_methods->ram_to_ram(itf_src, 0, itf_dst, 0);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	_STARPU_MPI_DEBUG(3, "Done, handling release of copy_handle..\n");
			
@@ -889,7 +977,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 
				 
			
 
				 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
			
 
				 
			
 
				-	_STARPU_MPI_DEBUG(3, "calling _starpu_mpi_submit_new_mpi_request with req %p tag %d and type %s\n", req, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
			
 
				+	_STARPU_MPI_DEBUG(3, "calling _starpu_mpi_submit_new_mpi_request with req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
			
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 
			
@@ -929,7 +1017,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 
				 		else
			
 
				 		{
			
 
				 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
			
 
				-			struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag);
			
 
				+			struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag, req->srcdst);
			
 
				 
			
 
				 			/* Case : the request has already been submitted internally by StarPU.
			
 
				 			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
			
@@ -937,6 +1025,13 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 
				 			 * bring the data back to the original data handle associated to the request.*/
			
 
				 			if (chandle)
			
 
				 			{
			
 
				+				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				+				STARPU_PTHREAD_MUTEX_LOCK(&(chandle->req_mutex));
			
 
				+				while (!(chandle->req_ready))
			
 
				+					STARPU_PTHREAD_COND_WAIT(&(chandle->req_cond), &(chandle->req_mutex));
			
 
				+				STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req_mutex));
			
 
				+				STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				+
			
 
				 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %d has already been received, copying previously received data into handle's pointer..\n", req, req->mpi_tag);
			
 
				 				STARPU_ASSERT(req->data_handle != chandle->handle);
			
 
				 
			
@@ -945,6 +1040,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 
				 				struct _starpu_mpi_copy_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_copy_cb_args));
			
 
				 				cb_args->data_handle = req->data_handle;
			
 
				 				cb_args->copy_handle = chandle->handle;
			
 
				+				cb_args->buffer = chandle->buffer;
			
 
				 				cb_args->req = req;
			
 
				 
			
 
				 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
			
@@ -954,6 +1050,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 
				 			 * We just add the pending receive request to the requests' hashmap. */
			
 
				 			else
			
 
				 			{
			
 
				+				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
			
 
				 				add_app_req(req);
			
 
				 			}
			
 
				 		}
			
@@ -1119,6 +1216,7 @@ static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
 
				 static void *_starpu_mpi_progress_thread_func(void *arg)
			
 
				 {
			
 
				 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
			
 
				+	int rank, worldsize;
			
 
				 
			
 
				 	if (argc_argv->initialize_mpi)
			
 
				 	{
			
@@ -1137,10 +1235,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
			
 
				 	}
			
 
				 
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
			
 
				+	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
			
 
				+
			
 
				 	{
			
 
				-		int rank, worldsize;
			
 
				-		MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-		MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
			
 
				 		TRACE_MPI_START(rank, worldsize);
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 		starpu_profiling_set_id(rank);
			
@@ -1162,7 +1261,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
			
 
				 	{
			
 
				 		/* shall we block ? */
			
 
				-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (HASH_COUNT(_starpu_mpi_app_req_hashmap) == 0);
			
 
				+		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (_starpu_mpi_app_req_hashmap_count == 0);
			
 
				 
			
 
				 #ifndef STARPU_MPI_ACTIVITY
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
			
@@ -1202,7 +1301,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
			
 
				 		 * requests in our side, we resubmit a header request. */
			
 
				 		MPI_Request header_req;
			
 
				-		if ((HASH_COUNT(_starpu_mpi_app_req_hashmap) > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
			
 
				+		if ((_starpu_mpi_app_req_hashmap_count > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
			
 
				 		{
			
 
				 			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop\n");
			
 
				 			MPI_Irecv(recv_env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _starpu_mpi_tag, MPI_COMM_WORLD, &header_req);
			
@@ -1226,37 +1325,53 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 
			
 
				 			if (flag)
			
 
				 			{
			
 
				-				_STARPU_MPI_DEBUG(3, "Searching for application request with tag %d (size %ld)\n", recv_env->mpi_tag, recv_env->psize);
			
 
				+				_STARPU_MPI_DEBUG(3, "Searching for application request with tag %d and source %d (size %ld)\n", recv_env->mpi_tag, status.MPI_SOURCE, recv_env->size);
			
 
				 
			
 
				-				struct _starpu_mpi_req *found_req = find_app_req(recv_env->mpi_tag);
			
 
				+				struct _starpu_mpi_req *found_req = find_app_req(recv_env->mpi_tag, status.MPI_SOURCE);
			
 
				 
			
 
				 				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
			
 
				 				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
			
 
				 				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
			
 
				 				if (!found_req)
			
 
				 				{
			
 
				-					_STARPU_MPI_DEBUG(3, "Request with tag %d not found, creating a copy_handle to receive incoming data..\n",recv_env->mpi_tag);
			
 
				+					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a copy_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
			
 
				 
			
 
				 					starpu_data_handle_t data_handle = NULL;
			
 
				 
			
 
				-					while(!(data_handle))
			
 
				-					{
			
 
				-						STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				-						data_handle = starpu_data_get_data_handle_from_tag(recv_env->mpi_tag);
			
 
				-						STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				-					}
			
 
				-					STARPU_ASSERT(data_handle);
			
 
				+					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				+					data_handle = starpu_data_get_data_handle_from_tag(recv_env->mpi_tag);
			
 
				+					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 
			
 
				-					struct _starpu_mpi_copy_handle* chandle = malloc(sizeof(struct _starpu_mpi_copy_handle));
			
 
				+					struct _starpu_mpi_copy_handle* chandle = calloc(1, sizeof(struct _starpu_mpi_copy_handle));
			
 
				 					STARPU_ASSERT(chandle);
			
 
				-
			
 
				+					STARPU_PTHREAD_MUTEX_INIT(&chandle->req_mutex, NULL);
			
 
				+					STARPU_PTHREAD_COND_INIT(&chandle->req_cond, NULL);
			
 
				 					chandle->mpi_tag = recv_env->mpi_tag;
			
 
				 					chandle->env = recv_env;
			
 
				-					starpu_data_register_same(&chandle->handle, data_handle);
			
 
				-					add_chandle(chandle);
			
 
				+					chandle->source = status.MPI_SOURCE;
			
 
				+
			
 
				+					if (data_handle)
			
 
				+					{
			
 
				+						chandle->buffer = NULL;
			
 
				+						starpu_data_register_same(&chandle->handle, data_handle);
			
 
				+						add_chandle(chandle);
			
 
				+					}
			
 
				+					else
			
 
				+					{
			
 
				+						/* The application has not registered yet a data with the tag,
			
 
				+						 * we are going to receive the data as a raw memory, and give it
			
 
				+						 * to the application when it post a receive for this tag
			
 
				+						 */
			
 
				+						_STARPU_MPI_DEBUG(20, "Posting a receive for a data of size %d which has not yet been registered\n", (int)chandle->env->size);
			
 
				+						chandle->buffer = malloc(chandle->env->size);
			
 
				+						starpu_vector_data_register(&chandle->handle, 0, (uintptr_t) chandle->buffer, chandle->env->size, 1);
			
 
				+						add_chandle(chandle);
			
 
				+					}
			
 
				 
			
 
				-					_STARPU_MPI_DEBUG(3, "Posting internal detached irecv on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
			
 
				-					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL, 1, 1, recv_env->psize);
			
 
				+					_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
			
 
				+					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				+					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL, 1, 1, recv_env->size);
			
 
				+					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 
			
 
				 					// We wait until the request is pushed in the
			
 
				 					// new_request list, that ensures that the next loop
			
@@ -1268,6 +1383,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 					while (!(chandle->req->posted))
			
 
				 					     STARPU_PTHREAD_COND_WAIT(&(chandle->req->posted_cond), &(chandle->req->posted_mutex));
			
 
				 					STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req->posted_mutex));
			
 
				+
			
 
				+					STARPU_PTHREAD_MUTEX_LOCK(&chandle->req_mutex);
			
 
				+					chandle->req_ready = 1;
			
 
				+					STARPU_PTHREAD_COND_BROADCAST(&chandle->req_cond);
			
 
				+					STARPU_PTHREAD_MUTEX_UNLOCK(&chandle->req_mutex);
			
 
				 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 				}
			
 
				 				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
			
@@ -1286,7 +1406,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 					}
			
 
				 					else
			
 
				 					{
			
 
				-						found_req->count = recv_env->psize;
			
 
				+						found_req->count = recv_env->size;
			
 
				 						found_req->ptr = malloc(found_req->count);
			
 
				 
			
 
				 						STARPU_ASSERT_MSG(found_req->ptr, "cannot allocate message of size %ld\n", found_req->count);
			
@@ -1313,8 +1433,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
			
 
				 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
			
 
				 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
			
 
				-	STARPU_ASSERT_MSG(HASH_COUNT(_starpu_mpi_app_req_hashmap) == 0, "Number of receive requests left is not zero");
			
 
				-	STARPU_ASSERT_MSG(HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0, "Number of copy requests left is not zero");
			
 
				+	STARPU_ASSERT_MSG(_starpu_mpi_app_req_hashmap_count == 0, "Number of receive requests left is not zero");
			
 
				+	STARPU_ASSERT_MSG(_starpu_mpi_copy_handle_hashmap_count == 0, "Number of copy requests left is not zero");
			
 
				+
			
 
				 	if (argc_argv->initialize_mpi)
			
 
				 	{
			
 
				 		_STARPU_MPI_DEBUG(3, "Calling MPI_Finalize()\n");
			
@@ -1323,6 +1444,21 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				 
			
 
				+	{
			
 
				+		int n;
			
 
				+		struct _starpu_mpi_copy_handle_hashlist *hashlist;
			
 
				+
			
 
				+		for(n=0 ; n<worldsize; n++)
			
 
				+		{
			
 
				+			for(hashlist=_starpu_mpi_copy_handle_hashmap[n]; hashlist != NULL; hashlist=hashlist->hh.next)
			
 
				+			{
			
 
				+				_starpu_mpi_copy_handle_list_delete(hashlist->list);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	free(_starpu_mpi_app_req_hashmap);
			
 
				+	free(_starpu_mpi_copy_handle_hashmap);
			
 
				 	free(argc_argv);
			
 
				 	free(recv_env);
			
 
				 
			
@@ -1406,6 +1542,16 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 
				 	_starpu_mpi_add_sync_point_in_fxt();
			
 
				 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
			
 
				 	_starpu_mpi_cache_init(MPI_COMM_WORLD);
			
 
				+
			
 
				+	{
			
 
				+		int nb_nodes, k;
			
 
				+		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
			
 
				+		_starpu_mpi_app_req_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_req *));
			
 
				+		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
			
 
				+		_starpu_mpi_copy_handle_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_copy_handle_hash_list *));
			
 
				+		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_copy_handle_hashmap[k] = NULL;
			
 
				+	}
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/mpi/src/starpu_mpi_private.h
+++ b/mpi/src/starpu_mpi_private.h
@@ -86,7 +86,7 @@ enum _starpu_mpi_request_type
 
				 
			
 
				 struct _starpu_mpi_envelope
			
 
				 {
			
 
				-	ssize_t psize;
			
 
				+	ssize_t size;
			
 
				 	int mpi_tag;
			
 
				 };
			
 
				 
			
--- a/mpi/src/starpu_mpi_insert_task.c
+++ b/mpi/src/starpu_mpi_insert_task.c
@@ -22,7 +22,7 @@
 
				 #include <starpu_data.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <common/uthash.h>
			
 
				-#include <util/starpu_insert_task_utils.h>
			
 
				+#include <util/starpu_task_insert_utils.h>
			
 
				 #include <datawizard/coherency.h>
			
 
				 #include <core/task.h>
			
 
				 
			
@@ -195,7 +195,7 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_a
 
				 			 * The application knows we won't do anything
			
 
				 			 * about this task */
			
 
				 			/* Yes, the app could actually not call
			
 
				-			 * insert_task at all itself, this is just a
			
 
				+			 * task_insert at all itself, this is just a
			
 
				 			 * safeguard. */
			
 
				 			_STARPU_MPI_DEBUG(3, "oh oh\n");
			
 
				 			_STARPU_MPI_LOG_OUT();
			
@@ -363,7 +363,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 
				 	}
			
 
				 }
			
 
				 
			
 
				-int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
			
 
				+int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
			
 
				 {
			
 
				 	int arg_type;
			
 
				 	va_list varg_list;
			
@@ -585,7 +585,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 	{
			
 
				 		/* Get the number of buffers and the size of the arguments */
			
 
				 		va_start(varg_list, codelet);
			
 
				-		arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
			
 
				+		arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list);
			
 
				 
			
 
				 		/* Pack arguments if needed */
			
 
				 		if (arg_buffer_size)
			
@@ -596,13 +596,16 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 
			
 
				 		_STARPU_MPI_DEBUG(1, "Execution of the codelet %p (%s)\n", codelet, codelet->name);
			
 
				 		va_start(varg_list, codelet);
			
 
				+
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				+		task->cl_arg_free = 1;
			
 
				+
			
 
				 		if (codelet->nbuffers > STARPU_NMAXBUFS)
			
 
				 		{
			
 
				 			task->dyn_handles = malloc(codelet->nbuffers * sizeof(starpu_data_handle_t));
			
 
				 		}
			
 
				-		int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, codelet, &task, varg_list);
			
 
				-		STARPU_ASSERT_MSG(ret==0, "_starpu_insert_task_create_and_submit failure %d", ret);
			
 
				+		int ret = _starpu_task_insert_create_and_submit(arg_buffer, arg_buffer_size, codelet, &task, varg_list);
			
 
				+		STARPU_ASSERT_MSG(ret==0, "_starpu_task_insert_create_and_submit failure %d", ret);
			
 
				 	}
			
 
				 
			
 
				 	if (inconsistent_execute)
			
@@ -809,7 +812,60 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 
				 	}
			
 
				 }
			
 
				 
			
 
				-/* TODO: this should rather be implicitly called by starpu_mpi_insert_task when
			
 
				+struct _starpu_mpi_redux_data_args
			
 
				+{
			
 
				+	starpu_data_handle_t data_handle;
			
 
				+	starpu_data_handle_t new_handle;
			
 
				+	int tag;
			
 
				+	int node;
			
 
				+	MPI_Comm comm;
			
 
				+	struct starpu_task *taskB;
			
 
				+};
			
 
				+
			
 
				+void _starpu_mpi_redux_data_dummy_func(STARPU_ATTRIBUTE_UNUSED void *buffers[], STARPU_ATTRIBUTE_UNUSED void *cl_arg)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+struct starpu_codelet _starpu_mpi_redux_data_read_cl =
			
 
				+{
			
 
				+	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
			
 
				+	.cuda_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
			
 
				+	.opencl_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_R},
			
 
				+	.name = "_starpu_mpi_redux_data_read_cl"
			
 
				+};
			
 
				+
			
 
				+struct starpu_codelet _starpu_mpi_redux_data_readwrite_cl =
			
 
				+{
			
 
				+	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
			
 
				+	.cuda_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
			
 
				+	.opencl_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_RW},
			
 
				+	.name = "_starpu_mpi_redux_data_write_cl"
			
 
				+};
			
 
				+
			
 
				+void _starpu_mpi_redux_data_detached_callback(void *arg)
			
 
				+{
			
 
				+	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) arg;
			
 
				+
			
 
				+	STARPU_TASK_SET_HANDLE(args->taskB, args->new_handle, 1);
			
 
				+	int ret = starpu_task_submit(args->taskB);
			
 
				+	STARPU_ASSERT(ret == 0);
			
 
				+
			
 
				+	starpu_data_unregister_submit(args->new_handle);
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
			
 
				+{
			
 
				+	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) callback_arg;
			
 
				+	starpu_data_register_same(&args->new_handle, args->data_handle);
			
 
				+
			
 
				+	starpu_mpi_irecv_detached_sequential_consistency(args->new_handle, args->node, args->tag, args->comm, _starpu_mpi_redux_data_detached_callback, args, 0);
			
 
				+}
			
 
				+
			
 
				+/* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
			
 
				  * a data previously accessed in REDUX mode gets accessed in R mode. */
			
 
				 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
			
 
				 {
			
@@ -836,46 +892,68 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 
				 	// need to count how many nodes have the data in redux mode
			
 
				 	if (me == rank)
			
 
				 	{
			
 
				-		int i;
			
 
				+		int i, j=0;
			
 
				+		struct starpu_task *taskBs[nb_nodes];
			
 
				 
			
 
				 		for(i=0 ; i<nb_nodes ; i++)
			
 
				 		{
			
 
				 			if (i != rank)
			
 
				 			{
			
 
				-				starpu_data_handle_t new_handle;
			
 
				-
			
 
				-				starpu_data_register_same(&new_handle, data_handle);
			
 
				-
			
 
				-				_STARPU_MPI_DEBUG(1, "Receiving redux handle from %d in %p ...\n", i, new_handle);
			
 
				-
			
 
				-				/* FIXME: we here allocate a lot of data: one
			
 
				-				 * instance per MPI node and per number of
			
 
				-				 * times we are called. We should rather do
			
 
				-				 * that much later, e.g. after data_handle
			
 
				-				 * finished its last read access, by submitting
			
 
				-				 * an empty task A reading data_handle whose
			
 
				-				 * callback submits the mpi comm, whose
			
 
				-				 * callback submits the redux_cl task B with
			
 
				-				 * sequential consistency set to 0, and submit
			
 
				-				 * an empty task C writing data_handle and
			
 
				-				 * depending on task B, just to replug with
			
 
				-				 * implicit data dependencies with tasks
			
 
				-				 * inserted after this reduction.
			
 
				+				/* We need to make sure all is
			
 
				+				 * executed after data_handle finished
			
 
				+				 * its last read access, we hence do
			
 
				+				 * the following:
			
 
				+				 * - submit an empty task A reading
			
 
				+				 * data_handle whose callback submits
			
 
				+				 * the mpi comm with sequential
			
 
				+				 * consistency set to 0, whose
			
 
				+				 * callback submits the redux_cl task
			
 
				+				 * B with sequential consistency set
			
 
				+				 * to 0,
			
 
				+				 * - submit an empty task C reading
			
 
				+				 * and writing data_handle and
			
 
				+				 * depending on task B, just to replug
			
 
				+				 * with implicit data dependencies
			
 
				+				 * with tasks inserted after this
			
 
				+				 * reduction.
			
 
				 				 */
			
 
				-				starpu_mpi_irecv_detached(new_handle, i, tag, comm, NULL, NULL);
			
 
				-				starpu_insert_task(data_handle->redux_cl,
			
 
				-						   STARPU_RW, data_handle,
			
 
				-						   STARPU_R, new_handle,
			
 
				+
			
 
				+				/* FIXME: free args */
			
 
				+				struct _starpu_mpi_redux_data_args *args = malloc(sizeof(struct _starpu_mpi_redux_data_args));
			
 
				+				args->data_handle = data_handle;
			
 
				+				args->tag = tag;
			
 
				+				args->node = i;
			
 
				+				args->comm = comm;
			
 
				+
			
 
				+				// We need to create taskB early as
			
 
				+				// taskC declares a dependancy on it
			
 
				+				args->taskB = starpu_task_create();
			
 
				+				args->taskB->cl = args->data_handle->redux_cl;
			
 
				+				args->taskB->sequential_consistency = 0;
			
 
				+				STARPU_TASK_SET_HANDLE(args->taskB, args->data_handle, 0);
			
 
				+				taskBs[j] = args->taskB; j++;
			
 
				+
			
 
				+				// Submit taskA
			
 
				+				starpu_task_insert(&_starpu_mpi_redux_data_read_cl,
			
 
				+						   STARPU_R, data_handle,
			
 
				+						   STARPU_CALLBACK_WITH_ARG, _starpu_mpi_redux_data_recv_callback, args,
			
 
				 						   0);
			
 
				-				starpu_data_unregister_submit(new_handle);
			
 
				 			}
			
 
				 		}
			
 
				+
			
 
				+		// Submit taskC which depends on all taskBs created
			
 
				+		struct starpu_task *taskC = starpu_task_create();
			
 
				+		taskC->cl = &_starpu_mpi_redux_data_readwrite_cl;
			
 
				+		STARPU_TASK_SET_HANDLE(taskC, data_handle, 0);
			
 
				+		starpu_task_declare_deps_array(taskC, j, taskBs);
			
 
				+		int ret = starpu_task_submit(taskC);
			
 
				+		STARPU_ASSERT(ret == 0);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				 		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
			
 
				 		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
			
 
				-		starpu_insert_task(data_handle->init_cl, STARPU_W, data_handle, 0);
			
 
				+		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
			
 
				 	}
			
 
				 	/* FIXME: In order to prevent simultaneous receive submissions
			
 
				 	 * on the same handle, we need to wait that all the starpu_mpi
			
--- a/mpi/src/starpu_mpi_task_insert.h
+++ b/mpi/src/starpu_mpi_task_insert.h
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -102,7 +102,9 @@ starpu_mpi_TESTS =				\
 
				 	multiple_send				\
			
 
				 	mpi_scatter_gather			\
			
 
				 	mpi_reduction				\
			
 
				-	user_defined_datatype
			
 
				+	user_defined_datatype			\
			
 
				+	gather					\
			
 
				+	gather2
			
 
				 
			
 
				 noinst_PROGRAMS =				\
			
 
				 	datatypes				\
			
@@ -130,7 +132,9 @@ noinst_PROGRAMS =				\
 
				 	multiple_send				\
			
 
				 	mpi_scatter_gather			\
			
 
				 	mpi_reduction				\
			
 
				-	user_defined_datatype
			
 
				+	user_defined_datatype			\
			
 
				+	gather					\
			
 
				+	gather2
			
 
				 
			
 
				 mpi_isend_LDADD =					\
			
 
				 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
			
@@ -184,6 +188,10 @@ mpi_reduction_LDADD =			\
 
				 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
			
 
				 user_defined_datatype_LDADD =			\
			
 
				 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
			
 
				+gather_LDADD =			\
			
 
				+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
			
 
				+gather2_LDADD =			\
			
 
				+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
			
 
				 
			
 
				 ring_SOURCES = ring.c
			
 
				 ring_async_SOURCES = ring_async.c
			
--- a/mpi/tests/gather.c
+++ b/mpi/tests/gather.c
@@ -0,0 +1,76 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu_mpi.h>
			
 
				+#include "helper.h"
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int ret, rank, size;
			
 
				+	starpu_data_handle_t handle;
			
 
				+	int var;
			
 
				+
			
 
				+	MPI_Init(NULL, NULL);
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	MPI_Comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				+	if (size<3)
			
 
				+	{
			
 
				+		if (rank == 0)
			
 
				+			FPRINTF(stderr, "We need more than 2 processes.\n");
			
 
				+
			
 
				+		MPI_Finalize();
			
 
				+		return STARPU_TEST_SKIPPED;
			
 
				+	}
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				+
			
 
				+	if (rank == 0)
			
 
				+	{
			
 
				+		int n;
			
 
				+		for(n=1 ; n<size ; n++)
			
 
				+		{
			
 
				+			MPI_Status status;
			
 
				+
			
 
				+			FPRINTF_MPI("receiving from node %d\n", n);
			
 
				+			starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(var));
			
 
				+			starpu_mpi_recv(handle, n, 42, MPI_COMM_WORLD, &status);
			
 
				+			starpu_data_acquire(handle, STARPU_R);
			
 
				+			STARPU_ASSERT_MSG(var == n, "Received incorrect value <%d> from node <%d>\n", var, n);
			
 
				+			FPRINTF_MPI("received <%d> from node %d\n", var, n);
			
 
				+			starpu_data_release(handle);
			
 
				+			starpu_data_unregister(handle);
			
 
				+		}
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		FPRINTF_MPI("sending to node %d\n", 0);
			
 
				+		var = rank;
			
 
				+		starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(var));
			
 
				+		starpu_mpi_send(handle, 0, 42, MPI_COMM_WORLD);
			
 
				+		starpu_data_unregister(handle);
			
 
				+	}
			
 
				+
			
 
				+	starpu_mpi_shutdown();
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	MPI_Finalize();
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
--- a/mpi/tests/gather2.c
+++ b/mpi/tests/gather2.c
@@ -0,0 +1,98 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu_mpi.h>
			
 
				+#include "helper.h"
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int ret, rank, size;
			
 
				+
			
 
				+	MPI_Init(NULL, NULL);
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	MPI_Comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				+	if (size<3)
			
 
				+	{
			
 
				+		if (rank == 0)
			
 
				+			FPRINTF(stderr, "We need more than 2 processes.\n");
			
 
				+
			
 
				+		MPI_Finalize();
			
 
				+		return STARPU_TEST_SKIPPED;
			
 
				+	}
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				+
			
 
				+	if (rank == 0)
			
 
				+	{
			
 
				+		int n;
			
 
				+		for(n=1 ; n<size ; n++)
			
 
				+		{
			
 
				+			int i, var[2];
			
 
				+			MPI_Status status[3];
			
 
				+			starpu_data_handle_t handle[2];
			
 
				+
			
 
				+			FPRINTF_MPI("receiving from node %d\n", n);
			
 
				+			for(i=0 ; i<2 ; i++)
			
 
				+				starpu_variable_data_register(&handle[i], 0, (uintptr_t)&var[i], sizeof(var[i]));
			
 
				+
			
 
				+			starpu_mpi_recv(handle[0], n, 42, MPI_COMM_WORLD, &status[0]);
			
 
				+			starpu_data_acquire(handle[0], STARPU_R);
			
 
				+			STARPU_ASSERT_MSG(var[0] == n, "Received incorrect value <%d> from node <%d>\n", var[0], n);
			
 
				+			FPRINTF_MPI("received <%d> from node %d\n", var[0], n);
			
 
				+			starpu_data_release(handle[0]);
			
 
				+
			
 
				+			starpu_mpi_recv(handle[0], n, 42, MPI_COMM_WORLD, &status[1]);
			
 
				+			starpu_mpi_recv(handle[1], n, 44, MPI_COMM_WORLD, &status[2]);
			
 
				+			for(i=0 ; i<2 ; i++)
			
 
				+				starpu_data_acquire(handle[i], STARPU_R);
			
 
				+			STARPU_ASSERT_MSG(var[0] == n*2, "Received incorrect value <%d> from node <%d>\n", var[0], n);
			
 
				+			STARPU_ASSERT_MSG(var[1] == n*4, "Received incorrect value <%d> from node <%d>\n", var[0], n);
			
 
				+			FPRINTF_MPI("received <%d> and <%d> from node %d\n", var[0], var[1], n);
			
 
				+			for(i=0 ; i<2 ; i++)
			
 
				+				starpu_data_release(handle[i]);
			
 
				+			for(i=0 ; i<2 ; i++)
			
 
				+				starpu_data_unregister(handle[i]);
			
 
				+		}
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		int i, var[3];
			
 
				+		starpu_data_handle_t handle[3];
			
 
				+
			
 
				+		FPRINTF_MPI("sending to node %d\n", 0);
			
 
				+		var[0] = rank;
			
 
				+		var[1] = var[0] * 2;
			
 
				+		var[2] = var[0] * 4;
			
 
				+		for(i=0 ; i<3 ; i++)
			
 
				+			starpu_variable_data_register(&handle[i], 0, (uintptr_t)&var[i], sizeof(var[i]));
			
 
				+		starpu_mpi_send(handle[0], 0, 42, MPI_COMM_WORLD);
			
 
				+		starpu_mpi_send(handle[1], 0, 42, MPI_COMM_WORLD);
			
 
				+		starpu_mpi_send(handle[2], 0, 44, MPI_COMM_WORLD);
			
 
				+		for(i=0 ; i<3 ; i++)
			
 
				+			starpu_data_unregister(handle[i]);
			
 
				+	}
			
 
				+
			
 
				+	starpu_mpi_shutdown();
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	MPI_Finalize();
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
--- a/mpi/tests/insert_task.c
+++ b/mpi/tests/insert_task.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -102,14 +102,14 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				-	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				-	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				-	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				+	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
			
 
				+	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
			
 
				+	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
			
 
				+	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
			
 
				 
			
 
				 	FPRINTF(stderr, "Waiting ...\n");
			
 
				 	starpu_task_wait_for_all();
			
--- a/mpi/tests/insert_task_block.c
+++ b/mpi/tests/insert_task_block.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -125,10 +125,10 @@ int main(int argc, char **argv)
 
				 	{
			
 
				 		for (y = 0; y < BLOCKS; y++)
			
 
				 		{
			
 
				-			ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
			
 
				+			ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
			
 
				 						     STARPU_RW, data_handles[x][y],
			
 
				 						     0);
			
 
				-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/mpi/tests/insert_task_cache.c
+++ b/mpi/tests/insert_task_cache.c
@@ -82,14 +82,14 @@ void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 
				 
			
 
				 	for(i = 0; i < 5; i++)
			
 
				 	{
			
 
				-		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				+		ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
			
 
				 	}
			
 
				 
			
 
				 	for(i = 0; i < 5; i++)
			
 
				 	{
			
 
				-		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
			
 
				-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				+		ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
			
 
				 	}
			
 
				 
			
 
				 	starpu_task_wait_for_all();
			
@@ -130,7 +130,7 @@ int main(int argc, char **argv)
 
				 	{
			
 
				 		dst = (rank == 0) ? 1 : 0;
			
 
				 		result = (comm_amount_with_cache[dst] == comm_amount_without_cache[dst] * 5);
			
 
				-		fprintf(stderr, "Communication cache mechanism is %sworking\n", result?"":"NOT ");
			
 
				+		FPRINTF_MPI("Communication cache mechanism is %sworking\n", result?"":"NOT ");
			
 
				 	}
			
 
				 	else
			
 
				 		result = 1;
			
--- a/mpi/tests/insert_task_owner.c
+++ b/mpi/tests/insert_task_owner.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -101,41 +101,41 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 	node = starpu_data_get_rank(data_handlesx1);
			
 
				-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
			
 
				+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r_w,
			
 
				 				     STARPU_VALUE, &node, sizeof(node),
			
 
				 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
			
 
				 				     0);
			
 
				 	assert(err == 0);
			
 
				 
			
 
				 	node = starpu_data_get_rank(data_handlesx0);
			
 
				-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
			
 
				+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_rw_r,
			
 
				 				     STARPU_VALUE, &node, sizeof(node),
			
 
				 				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
			
 
				 				     0);
			
 
				 	assert(err == 0);
			
 
				 
			
 
				-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
			
 
				+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_rw_rw,
			
 
				 				     STARPU_VALUE, &node, sizeof(node),
			
 
				 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
			
 
				 				     0);
			
 
				 	assert(err == -EINVAL);
			
 
				 
			
 
				 	node = 1;
			
 
				-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
			
 
				+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_rw_rw,
			
 
				 				     STARPU_VALUE, &node, sizeof(node),
			
 
				 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
			
 
				 				     0);
			
 
				 	assert(err == 0);
			
 
				 
			
 
				 	node = 0;
			
 
				-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
			
 
				+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_rw_rw,
			
 
				 				     STARPU_VALUE, &node, sizeof(node),
			
 
				 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
			
 
				 				     0);
			
 
				 	assert(err == 0);
			
 
				 
			
 
				 	node = 0;
			
 
				-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
			
 
				+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r_r,
			
 
				 				     STARPU_VALUE, &node, sizeof(node),
			
 
				 				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
			
 
				 				     0);
			
@@ -145,7 +145,7 @@ int main(int argc, char **argv)
 
				 	   going to overwrite the node even though the data model clearly specifies
			
 
				 	   which node is going to execute the codelet */
			
 
				 	node = 0;
			
 
				-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
			
 
				+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r_w,
			
 
				 				     STARPU_VALUE, &node, sizeof(node),
			
 
				 				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
			
 
				 				     0);
			
@@ -155,13 +155,13 @@ int main(int argc, char **argv)
 
				 	   going to overwrite the node even though the data model clearly specifies
			
 
				 	   which node is going to execute the codelet */
			
 
				 	node = 0;
			
 
				-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
			
 
				+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_w_r,
			
 
				 				     STARPU_VALUE, &node, sizeof(node),
			
 
				 				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
			
 
				 				     0);
			
 
				 	assert(err == 0);
			
 
				 
			
 
				-	fprintf(stderr, "Waiting ...\n");
			
 
				+	FPRINTF_MPI("Waiting ...\n");
			
 
				 	starpu_task_wait_for_all();
			
 
				 	starpu_data_unregister(data_handlesx0);
			
 
				 	starpu_data_unregister(data_handlesx1);
			
--- a/mpi/tests/insert_task_owner2.c
+++ b/mpi/tests/insert_task_owner2.c
@@ -93,12 +93,12 @@ int main(int argc, char **argv)
 
				 	starpu_data_set_rank(data_handles[3], 1);
			
 
				 	starpu_data_set_tag(data_handles[3], 3);
			
 
				 
			
 
				-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
			
 
				+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
			
 
				 				     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
			
 
				 				     STARPU_W, data_handles[2],
			
 
				 				     STARPU_W, data_handles[3],
			
 
				 				     STARPU_EXECUTE_ON_NODE, 1, 0);
			
 
				-	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
			
 
				+	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_task_insert");
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 	int *values = malloc(4 * sizeof(int *));
			
--- a/mpi/tests/insert_task_owner_data.c
+++ b/mpi/tests/insert_task_owner_data.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -72,7 +72,7 @@ int main(int argc, char **argv)
 
				 	starpu_data_set_rank(data_handles[1], 1);
			
 
				 	starpu_data_set_tag(data_handles[1], 1);
			
 
				 
			
 
				-	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
			
 
				+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
			
 
				 				     STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
			
 
				 				     STARPU_EXECUTE_ON_DATA, data_handles[1],
			
 
				 				     0);
			
--- a/mpi/tests/mpi_earlyrecv.c
+++ b/mpi/tests/mpi_earlyrecv.c
@@ -52,12 +52,12 @@ int main(int argc, char **argv)
 
				 
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				 
			
 
				-	fprintf(stderr, "rank %d exchanging with rank %d\n", rank, other_rank);
			
 
				+	FPRINTF_MPI("rank %d exchanging with rank %d\n", rank, other_rank);
			
 
				 
			
 
				 	if (rank%2)
			
 
				 	{
			
 
				 		starpu_mpi_isend(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
			
 
				-		starpu_mpi_recv(tab_handle[2], other_rank, 2, MPI_COMM_WORLD, NULL);
			
 
				+		starpu_mpi_recv(tab_handle[2], other_rank, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			
 
				 		starpu_mpi_isend(tab_handle[1], &request[1], other_rank, 1, MPI_COMM_WORLD);
			
 
				 		nb_requests = 2;
			
 
				 	}
			
@@ -80,7 +80,7 @@ int main(int argc, char **argv)
 
				 				MPI_Status status;
			
 
				 				starpu_mpi_test(&request[i], &flag, &status);
			
 
				 				if (flag)
			
 
				-					fprintf(stderr, "request[%d] = %d %p\n", i, flag, request[i]);
			
 
				+					FPRINTF_MPI("request[%d] = %d %p\n", i, flag, request[i]);
			
 
				 			}
			
 
				 		}
			
 
				 		finished = request[0] == NULL;
			
--- a/mpi/tests/mpi_earlyrecv2.c
+++ b/mpi/tests/mpi_earlyrecv2.c
@@ -99,7 +99,7 @@ int exchange(int rank, starpu_data_handle_t *handles, check_func func, int detac
 
				 		{
			
 
				 			for(i=0 ; i<NB ; i++)
			
 
				 			{
			
 
				-			     starpu_mpi_wait(&req[i], NULL);
			
 
				+			     starpu_mpi_wait(&req[i], MPI_STATUS_IGNORE);
			
 
				 			     func(handles[i], i, rank, &ret);
			
 
				 			}
			
 
				 		}
			
--- a/mpi/tests/mpi_reduction.c
+++ b/mpi/tests/mpi_reduction.c
@@ -17,6 +17,7 @@
 
				 
			
 
				 #include <starpu_mpi.h>
			
 
				 #include <math.h>
			
 
				+#include "helper.h"
			
 
				 
			
 
				 extern void init_cpu_func(void *descr[], void *cl_arg);
			
 
				 extern void redux_cpu_func(void *descr[], void *cl_arg);
			
@@ -111,6 +112,7 @@ int main(int argc, char **argv)
 
				 	handles = (starpu_data_handle_t *) malloc(nb_elements*sizeof(handles[0]));
			
 
				 	for(x = 0; x < nb_elements; x+=step)
			
 
				 	{
			
 
				+		handles[x] = NULL;
			
 
				 		int mpi_rank = my_distrib(x/step, size);
			
 
				 		if (mpi_rank == my_rank)
			
 
				 		{
			
@@ -136,17 +138,17 @@ int main(int argc, char **argv)
 
				 	{
			
 
				 		for (x = 0; x < nb_elements; x+=step)
			
 
				 		{
			
 
				-			starpu_mpi_insert_task(MPI_COMM_WORLD,
			
 
				+			starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				 					       &dot_codelet,
			
 
				 					       STARPU_R, handles[x],
			
 
				 					       STARPU_REDUX, dot_handle,
			
 
				 					       0);
			
 
				 		}
			
 
				 		starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
			
 
				-		starpu_mpi_insert_task(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
			
 
				+		starpu_mpi_task_insert(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
			
 
				 	}
			
 
				 
			
 
				-	fprintf(stderr, "Waiting ...\n");
			
 
				+	FPRINTF_MPI("Waiting ...\n");
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 	for(x = 0; x < nb_elements; x+=step)
			
@@ -165,9 +167,9 @@ int main(int argc, char **argv)
 
				 
			
 
				 	if (my_rank == 0)
			
 
				 	{
			
 
				-		fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
			
 
				-		fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
			
 
				-		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
			
 
				+		FPRINTF(stderr, "[%d] sum=%ld\n", my_rank, sum);
			
 
				+		FPRINTF(stderr, "[%d] dot=%ld\n", my_rank, dot);
			
 
				+		FPRINTF(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
			
 
				 	}
			
 
				 
			
 
				 	return 0;
			
--- a/mpi/tests/mpi_reduction_kernels.c
+++ b/mpi/tests/mpi_reduction_kernels.c
@@ -17,10 +17,7 @@
 
				 #include <starpu.h>
			
 
				 #include <mpi.h>
			
 
				 
			
 
				-#define _DISPLAY(fmt, ...) do { \
			
 
				-		int _display_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_display_rank);	\
			
 
				-		fprintf(stderr, "[%d][%s] " fmt , _display_rank, __starpu_func__ ,## __VA_ARGS__); 	\
			
 
				-		fflush(stderr); } while(0)
			
 
				+#include "helper.h"
			
 
				 
			
 
				 /*
			
 
				  *	Codelet to create a neutral element
			
@@ -29,7 +26,7 @@ void init_cpu_func(void *descr[], void *cl_arg)
 
				 {
			
 
				 	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 	*dot = 0;
			
 
				-	_DISPLAY("Init dot\n");
			
 
				+	FPRINTF_MPI("Init dot\n");
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -41,7 +38,7 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 
				 	long int *dotb = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
			
 
				 
			
 
				 	*dota = *dota + *dotb;
			
 
				-	_DISPLAY("Calling redux %ld=%ld+%ld\n", *dota, *dota-*dotb, *dotb);
			
 
				+	FPRINTF_MPI("Calling redux %ld=%ld+%ld\n", *dota, *dota-*dotb, *dotb);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -54,14 +51,14 @@ void dot_cpu_func(void *descr[], void *cl_arg)
 
				 
			
 
				 	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
			
 
				 
			
 
				-//	_DISPLAY("Before dot=%ld (adding %d elements...)\n", *dot, n);
			
 
				+	//FPRINTF_MPI("Before dot=%ld (adding %d elements...)\n", *dot, n);
			
 
				 	unsigned i;
			
 
				 	for (i = 0; i < n; i++)
			
 
				 	{
			
 
				-//		_DISPLAY("Adding %ld\n", local_x[i]);
			
 
				+		//FPRINTF_MPI("Adding %ld\n", local_x[i]);
			
 
				 		*dot += local_x[i];
			
 
				 	}
			
 
				-//	_DISPLAY("After dot=%ld\n", *dot);
			
 
				+	//FPRINTF_MPI("After dot=%ld\n", *dot);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -71,6 +68,6 @@ void display_cpu_func(void *descr[], void *cl_arg)
 
				 {
			
 
				 	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				-	_DISPLAY("Local=%ld\n", *local_x);
			
 
				+	FPRINTF_MPI("Local=%ld\n", *local_x);
			
 
				 }
			
 
				 
			
--- a/mpi/tests/mpi_redux.c
+++ b/mpi/tests/mpi_redux.c
@@ -26,7 +26,7 @@ void callback(void *arg)
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 	*received = *received + 1;
			
 
				-	fprintf(stderr, "received = %d\n", *received);
			
 
				+	FPRINTF_MPI("received = %d\n", *received);
			
 
				 	STARPU_PTHREAD_COND_SIGNAL(&cond);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				 }
			
@@ -89,7 +89,7 @@ int main(int argc, char **argv)
 
				 		starpu_data_unregister_submit(handles[0]);
			
 
				 
			
 
				 		starpu_variable_data_register(&handles[0], STARPU_MAIN_RAM, (uintptr_t)&value, sizeof(int));
			
 
				-		starpu_mpi_recv(handles[0], 0, 12+rank, MPI_COMM_WORLD, NULL);
			
 
				+		starpu_mpi_recv(handles[0], 0, 12+rank, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			
 
				 		starpu_data_unregister(handles[0]);
			
 
				 	}
			
 
				 
			
--- a/mpi/tests/mpi_scatter_gather.c
+++ b/mpi/tests/mpi_scatter_gather.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu_mpi.h>
			
 
				+#include "helper.h"
			
 
				 
			
 
				 /* Returns the MPI node number where data indexes index is */
			
 
				 int my_distrib(int x, int y, int nb_nodes)
			
@@ -35,12 +36,12 @@ void cpu_codelet(void *descr[], void *_args)
 
				 	starpu_codelet_unpack_args(_args, &rank);
			
 
				 	factor = block[0];
			
 
				 
			
 
				-	//fprintf(stderr,"rank %d factor %f\n", rank, factor);
			
 
				+	//FPRINTF_MPI("rank %d factor %f\n", rank, factor);
			
 
				 	for (j = 0; j < nx; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < nx; i++)
			
 
				 		{
			
 
				-			//fprintf(stderr,"rank %d factor %f --> %f %f\n", rank, factor, block[j+i*ld], block[j+i*ld]*factor);
			
 
				+			//FPRINTF_MPI("rank %d factor %f --> %f %f\n", rank, factor, block[j+i*ld], block[j+i*ld]*factor);
			
 
				 			block[j+i*ld] *= factor;
			
 
				 		}
			
 
				 	}
			
@@ -56,13 +57,13 @@ static struct starpu_codelet cl =
 
				 void scallback(void *arg STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	char *msg = arg;
			
 
				-	fprintf(stderr, "Sending completed for <%s>\n", msg);
			
 
				+	FPRINTF_MPI("Sending completed for <%s>\n", msg);
			
 
				 }
			
 
				 
			
 
				 void rcallback(void *arg STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	char *msg = arg;
			
 
				-	fprintf(stderr, "Reception completed for <%s>\n", msg);
			
 
				+	FPRINTF_MPI("Reception completed for <%s>\n", msg);
			
 
				 }
			
 
				 
			
 
				 int main(int argc, char **argv)
			
@@ -177,7 +178,7 @@ int main(int argc, char **argv)
 
				 			if (owner == rank)
			
 
				 			{
			
 
				 				//fprintf(stderr,"[%d] Computing on data[%d]\n", rank, x);
			
 
				-				starpu_insert_task(&cl,
			
 
				+				starpu_task_insert(&cl,
			
 
				 						   STARPU_VALUE, &rank, sizeof(rank),
			
 
				 						   STARPU_RW, data_handles[x],
			
 
				 						   0);
			
--- a/mpi/tests/user_defined_datatype.c
+++ b/mpi/tests/user_defined_datatype.c
@@ -18,6 +18,7 @@
 
				 #include <interface/complex_interface.h>
			
 
				 #include <interface/complex_codelet.h>
			
 
				 #include <user_defined_datatype_value.h>
			
 
				+#include "helper.h"
			
 
				 
			
 
				 #ifdef STARPU_QUICK_CHECK
			
 
				 #  define ELEMENTS 10
			
@@ -100,7 +101,7 @@ int main(int argc, char **argv)
 
				 			float foo_compare=42.0;
			
 
				 			int value_compare=36;
			
 
				 
			
 
				-			fprintf(stderr, "\nTesting with function %p\n", f);
			
 
				+			FPRINTF_MPI("\nTesting with function %p\n", f);
			
 
				 
			
 
				 			if (rank == 0)
			
 
				 			{
			
@@ -153,13 +154,13 @@ int main(int argc, char **argv)
 
				 					compare = (foo[i] == foo_compare);
			
 
				 					if (compare == 0)
			
 
				 					{
			
 
				-						fprintf(stderr, "ERROR. foo[%d] == %f != %f\n", i, foo[i], foo_compare);
			
 
				+						FPRINTF_MPI("ERROR. foo[%d] == %f != %f\n", i, foo[i], foo_compare);
			
 
				 						goto end;
			
 
				 					}
			
 
				 					compare = (values[i] == value_compare);
			
 
				 					if (compare == 0)
			
 
				 					{
			
 
				-						fprintf(stderr, "ERROR. value[%d] == %d != %d\n", i, values[i], value_compare);
			
 
				+						FPRINTF_MPI("ERROR. value[%d] == %d != %d\n", i, values[i], value_compare);
			
 
				 						goto end;
			
 
				 					}
			
 
				 					for(j=0 ; j<2 ; j++)
			
@@ -167,7 +168,7 @@ int main(int argc, char **argv)
 
				 						compare = (real[i][j] == real_compare[j]);
			
 
				 						if (compare == 0)
			
 
				 						{
			
 
				-							fprintf(stderr, "ERROR. real[%d][%d] == %f != %f\n", i, j, real[i][j], real_compare[j]);
			
 
				+							FPRINTF_MPI("ERROR. real[%d][%d] == %f != %f\n", i, j, real[i][j], real_compare[j]);
			
 
				 							goto end;
			
 
				 						}
			
 
				 					}
			
@@ -176,7 +177,7 @@ int main(int argc, char **argv)
 
				 						compare = (imaginary[i][j] == imaginary_compare[j]);
			
 
				 						if (compare == 0)
			
 
				 						{
			
 
				-							fprintf(stderr, "ERROR. imaginary[%d][%d] == %f != %f\n", i, j, imaginary[i][j], imaginary_compare[j]);
			
 
				+							FPRINTF_MPI("ERROR. imaginary[%d][%d] == %f != %f\n", i, j, imaginary[i][j], imaginary_compare[j]);
			
 
				 							goto end;
			
 
				 						}
			
 
				 					}
			
--- a/sc_hypervisor/examples/cholesky/cholesky_implicit.c
+++ b/sc_hypervisor/examples/cholesky/cholesky_implicit.c
@@ -96,17 +96,17 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
			
 
				 		if(k == 0 && with_ctxs)
			
 
				 		{
			
 
				-			 ret = starpu_insert_task(&cl11,
			
 
				-					   STARPU_PRIORITY, prio_level,
			
 
				-					   STARPU_RW, sdatakk,
			
 
				-					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
			
 
				-					   STARPU_HYPERVISOR_TAG, hypervisor_tag,
			
 
				-					   0);
			
 
				+			 ret = starpu_task_insert(&cl11,
			
 
				+						  STARPU_PRIORITY, prio_level,
			
 
				+						  STARPU_RW, sdatakk,
			
 
				+						  STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
			
 
				+						  STARPU_HYPERVISOR_TAG, hypervisor_tag,
			
 
				+						  0);
			
 
				 			set_hypervisor_conf(START_BENCH, hypervisor_tag++);
			
 
				-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 		}
			
 
				 		else
			
 
				-			starpu_insert_task(&cl11,
			
 
				+			starpu_task_insert(&cl11,
			
 
				 					   STARPU_PRIORITY, prio_level,
			
 
				 					   STARPU_RW, sdatakk,
			
 
				 					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
			
@@ -116,12 +116,12 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 		{
			
 
				                         starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
			
 
				 
			
 
				-                        ret = starpu_insert_task(&cl21,
			
 
				+                        ret = starpu_task_insert(&cl21,
			
 
				 						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
			
 
				 						 STARPU_R, sdatakk,
			
 
				 						 STARPU_RW, sdatakj,
			
 
				 						 0);
			
 
				-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 
			
 
				 			for (i = k+1; i<nblocks; i++)
			
 
				 			{
			
@@ -133,25 +133,25 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 					if(k == (nblocks-2) && j == (nblocks-1) &&
			
 
				 					   i == (k + 1) && with_ctxs)
			
 
				 					{
			
 
				-						ret = starpu_insert_task(&cl22,
			
 
				+						ret = starpu_task_insert(&cl22,
			
 
				 								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
			
 
				 								   STARPU_R, sdataki,
			
 
				 								   STARPU_R, sdatakj,
			
 
				 								   STARPU_RW, sdataij,
			
 
				 								   STARPU_HYPERVISOR_TAG, hypervisor_tag,
			
 
				 								   0);
			
 
				-						STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 						set_hypervisor_conf(END_BENCH, hypervisor_tag++);
			
 
				 					}
			
 
				 					
			
 
				 					else
			
 
				-						ret = starpu_insert_task(&cl22,
			
 
				+						ret = starpu_task_insert(&cl22,
			
 
				 								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
			
 
				 								   STARPU_R, sdataki,
			
 
				 								   STARPU_R, sdatakj,
			
 
				 								   STARPU_RW, sdataij,
			
 
				 								   0);
			
 
				-						STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 					
			
 
				                    }
			
 
				 			}
			
--- a/sc_hypervisor/include/sc_hypervisor.h
+++ b/sc_hypervisor/include/sc_hypervisor.h
@@ -128,6 +128,8 @@ void sc_hypervisor_update_diff_total_flops(unsigned sched_ctx, double diff_total
 
				 /* change dynamically the number of the elapsed flops in a context, modify the past in order to better compute the speed */
			
 
				 void sc_hypervisor_update_diff_elapsed_flops(unsigned sched_ctx, double diff_task_flops);
			
 
				 
			
 
				+/* updates the min and max workers needed by each context */
			
 
				+	void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs);
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/sc_hypervisor/include/sc_hypervisor_lp.h
+++ b/sc_hypervisor/include/sc_hypervisor_lp.h
@@ -65,7 +65,7 @@ unsigned sc_hypervisor_lp_execute_dichotomy(int ns, int nw, double w_in_s[ns][nw
 
				 /* linear program that returns 1/tmax, and computes in table res the nr of workers needed by each context st 
			
 
				    the system ends up in the smallest tmax*/
			
 
				 double sc_hypervisor_lp_simulate_distrib_flops(int nsched_ctxs, int ntypes_of_workers, double speed[nsched_ctxs][ntypes_of_workers], 
			
 
				-					       double flops[nsched_ctxs], double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers]);
			
 
				+					       double flops[nsched_ctxs], double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers], unsigned sched_ctxs[nsched_ctxs]);
			
 
				 
			
 
				 /* linear program that simulates a distribution of tasks that minimises the execution time of the tasks in the pool */
			
 
				 double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt],
			
--- a/sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
+++ b/sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
@@ -73,6 +73,7 @@ static void feft_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx,
 
				 				_try_resizing(NULL, -1, NULL, -1);
			
 
				 			}
			
 
				 		}
			
 
				+	
			
 
				 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
			
 
				 	}
			
 
				 
			
--- a/sc_hypervisor/src/policies_utils/lp_programs.c
+++ b/sc_hypervisor/src/policies_utils/lp_programs.c
@@ -249,14 +249,14 @@ double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_
 
				 	return res;
			
 
				 }
			
 
				 
			
 
				-double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw], double flops[ns], double res[ns][nw], int  total_nw[nw])
			
 
				+double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw], double flops[ns], double res[ns][nw], int  total_nw[nw], unsigned sched_ctxs[ns])
			
 
				 {
			
 
				 	int integer = 1;
			
 
				 	int s, w;
			
 
				 	glp_prob *lp;
			
 
				 
			
 
				-	int ne =
			
 
				-		(ns*nw+1)*(ns+nw)
			
 
				+	int ne = //ns * (nw*ns + 1) +
			
 
				+		(ns*nw+1)*(2*ns+nw)
			
 
				 		+ 1; /* glp dumbness */
			
 
				 	int n = 1;
			
 
				 	int ia[ne], ja[ne];
			
@@ -338,6 +338,69 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 
				 		n++;
			
 
				 	}
			
 
				 
			
 
				+	/* one row corresponds to one ctx*/
			
 
				+	glp_add_rows(lp, ns);
			
 
				+
			
 
				+	for(s = 0; s < ns; s++)
			
 
				+	{
			
 
				+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs[s]);
			
 
				+		char name[32];
			
 
				+		snprintf(name, sizeof(name), "ctx%d", s);
			
 
				+		glp_set_row_name(lp, ns+s+1, name);
			
 
				+		glp_set_row_bnds(lp, ns+s+1, GLP_LO, 0., 0.);
			
 
				+		
			
 
				+
			
 
				+		int s2;
			
 
				+		for(s2 = 0; s2 < ns; s2++)
			
 
				+		{
			
 
				+			if(s2 == s)
			
 
				+			{
			
 
				+
			
 
				+				for(w = 0; w < nw; w++)
			
 
				+				{
			
 
				+					/* only for CPUs for now */
			
 
				+					if(w == 0)
			
 
				+					{
			
 
				+						ia[n] = ns+s+1;
			
 
				+						ja[n] = w+s2*nw + 1;
			
 
				+						ar[n] = 1.0;
			
 
				+//					printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
			
 
				+					}
			
 
				+					else
			
 
				+					{
			
 
				+						ia[n] = ns+s+1;
			
 
				+						ja[n] = w+s2*nw + 1;
			
 
				+						ar[n] = 0.0;
			
 
				+//					printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
			
 
				+
			
 
				+					}
			
 
				+					n++;
			
 
				+				}
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				for(w = 0; w < nw; w++)
			
 
				+				{
			
 
				+
			
 
				+					ia[n] = ns+s+1;
			
 
				+					ja[n] = w+s2*nw + 1;
			
 
				+					ar[n] = 0.0;
			
 
				+//					printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
			
 
				+					n++;
			
 
				+				}
			
 
				+				
			
 
				+			}
			
 
				+				
			
 
				+		}
			
 
				+		ia[n] = ns+s+1;
			
 
				+		ja[n] = ns*nw+1;
			
 
				+		ar[n] = 0.0;
			
 
				+		n++;
			
 
				+		
			
 
				+		glp_set_row_bnds(lp, ns+s+1, GLP_UP, config->min_nworkers, config->max_nworkers);
			
 
				+
			
 
				+	}
			
 
				+
			
 
				 	/*we add another linear constraint : sum(all cpus) = 9 and sum(all gpus) = 3 */
			
 
				 	glp_add_rows(lp, nw);
			
 
				 
			
@@ -345,7 +408,7 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 
				 	{
			
 
				 		char name[32];
			
 
				 		snprintf(name, sizeof(name), "w%d", w);
			
 
				-		glp_set_row_name(lp, ns+w+1, name);
			
 
				+		glp_set_row_name(lp, 2*ns+w+1, name);
			
 
				 		for(s = 0; s < ns; s++)
			
 
				 		{
			
 
				 			int w2;
			
@@ -353,14 +416,14 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 
				 			{
			
 
				 				if(w2 == w)
			
 
				 				{
			
 
				-					ia[n] = ns+w+1;
			
 
				+					ia[n] = 2*ns+w+1;
			
 
				 					ja[n] = w2+s*nw + 1;
			
 
				 					ar[n] = 1.0;
			
 
				 //					printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
			
 
				 				}
			
 
				 				else
			
 
				 				{
			
 
				-					ia[n] = ns+w+1;
			
 
				+					ia[n] = 2*ns+w+1;
			
 
				 					ja[n] = w2+s*nw + 1;
			
 
				 					ar[n] = 0.0;
			
 
				 //					printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
			
@@ -369,7 +432,7 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 
				 			}
			
 
				 		}
			
 
				 		/* 1/tmax */
			
 
				-		ia[n] = ns+w+1;
			
 
				+		ia[n] = 2*ns+w+1;
			
 
				 		ja[n] = ns*nw+1;
			
 
				 		ar[n] = 0.0;
			
 
				 //		printf("ia[%d]=%d ja[%d]=%d ar[%d]=%lf\n", n, ia[n], n, ja[n], n, ar[n]);
			
@@ -377,11 +440,11 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 
				 
			
 
				 		/*sum(all gpus) = 3*/
			
 
				 		if(w == 0)
			
 
				-			glp_set_row_bnds(lp, ns+w+1, GLP_FX, total_nw[0], total_nw[0]);
			
 
				+			glp_set_row_bnds(lp, 2*ns+w+1, GLP_FX, total_nw[0], total_nw[0]);
			
 
				 
			
 
				 		/*sum(all cpus) = 9*/
			
 
				 		if(w == 1)
			
 
				-			glp_set_row_bnds(lp, ns+w+1, GLP_FX, total_nw[1], total_nw[1]);
			
 
				+			glp_set_row_bnds(lp, 2*ns+w+1, GLP_FX, total_nw[1], total_nw[1]);
			
 
				 	}
			
 
				 
			
 
				 	STARPU_ASSERT(n == ne);
			
--- a/sc_hypervisor/src/policies_utils/lp_tools.c
+++ b/sc_hypervisor/src/policies_utils/lp_tools.c
@@ -28,6 +28,8 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 
				 	double v[nsched_ctxs][ntypes_of_workers];
			
 
				 	double flops[nsched_ctxs];
			
 
				 
			
 
				+	sc_hypervisor_update_resize_interval(sched_ctxs, nsched_ctxs);
			
 
				+
			
 
				 	int nw = tw->nw;
			
 
				 	int i = 0;
			
 
				 	struct sc_hypervisor_wrapper* sc_w;
			
@@ -59,8 +61,9 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 
				 /* 		printf("%d: flops %lf remaining flops %lf ready flops %lf nready_tasks %d\n", */
			
 
				 /* 		       sched_ctxs[i], flops[i], sc_w->remaining_flops/1000000000, sc_w->ready_flops/1000000000, sc_w->nready_tasks); */
			
 
				 	}
			
 
				-
			
 
				-	double vmax = 1/sc_hypervisor_lp_simulate_distrib_flops(nsched_ctxs, ntypes_of_workers, v, flops, res, total_nw);
			
 
				+		
			
 
				+	
			
 
				+	double vmax = 1/sc_hypervisor_lp_simulate_distrib_flops(nsched_ctxs, ntypes_of_workers, v, flops, res, total_nw, sched_ctxs);
			
 
				 	double optimal_v = 0.0;
			
 
				 	for(i = 0; i < nsched_ctxs; i++)
			
 
				 	{
			
--- a/sc_hypervisor/src/policies_utils/policy_tools.c
+++ b/sc_hypervisor/src/policies_utils/policy_tools.c
@@ -465,6 +465,7 @@ unsigned sc_hypervisor_check_idle(unsigned sched_ctx, int worker)
 
				 	struct sc_hypervisor_policy_config *config = sc_w->config;
			
 
				 	if(config != NULL)
			
 
				 	{
			
 
				+		printf("w%d/ctx%d: current idle %lf max_idle %lf\n", worker, sched_ctx, sc_w->current_idle_time[worker], config->max_idle[worker]);
			
 
				 		if(sc_w->current_idle_time[worker] > config->max_idle[worker])
			
 
				 		{
			
 
				 			sc_w->current_idle_time[worker] = 0.0;
			
--- a/sc_hypervisor/src/sc_config.c
+++ b/sc_hypervisor/src/sc_config.c
@@ -74,7 +74,7 @@ void _add_config(unsigned sched_ctx)
 
				 {
			
 
				 	struct sc_hypervisor_policy_config *config = _create_config();
			
 
				 	config->min_nworkers = 0;
			
 
				-	config->max_nworkers = STARPU_NMAXWORKERS;
			
 
				+	config->max_nworkers = starpu_worker_get_count();
			
 
				 	config->new_workers_max_idle = MAX_IDLE_TIME;
			
 
				 
			
 
				 	int i;
			
--- a/sc_hypervisor/src/sc_hypervisor.c
+++ b/sc_hypervisor/src/sc_hypervisor.c
@@ -458,6 +458,7 @@ static void _reset_idle_time(unsigned sched_ctx)
 
				 	for(i = 0; i < STARPU_NMAXWORKERS; i++)
			
 
				 	{
			
 
				 		hypervisor.sched_ctx_w[sched_ctx].idle_time[i] = 0.0;
			
 
				+		hypervisor.sched_ctx_w[sched_ctx].idle_start_time[i] = hypervisor.sched_ctx_w[sched_ctx].idle_start_time[i] != 0.0 ? starpu_timing_now() : 0.0;
			
 
				 	}
			
 
				 	return;
			
 
				 }
			
@@ -473,6 +474,12 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 
				 		sender_sc_w->start_time = start_time;
			
 
				 		_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
			
 
				 		_reset_idle_time(sender_sched_ctx);
			
 
				+		int i;
			
 
				+		for(i = 0; i < STARPU_NMAXWORKERS; i++)
			
 
				+		{
			
 
				+			sender_sc_w->idle_start_time[i] = 0.0;
			
 
				+		}
			
 
				+		
			
 
				 	}
			
 
				 
			
 
				 	if(receiver_sched_ctx != STARPU_NMAX_SCHED_CTXS)
			
@@ -483,6 +490,12 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 
				 		receiver_sc_w->start_time = start_time;
			
 
				 		_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
			
 
				 		_reset_idle_time(receiver_sched_ctx);
			
 
				+		int i;
			
 
				+		for(i = 0; i < STARPU_NMAXWORKERS; i++)
			
 
				+		{
			
 
				+			receiver_sc_w->idle_start_time[i] = (hypervisor.sched_ctx_w[receiver_sched_ctx].idle_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
			
 
				+		}
			
 
				+
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -763,6 +776,115 @@ void sc_hypervisor_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *work
 
				 		hypervisor.policy.resize_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
			
 
				 }
			
 
				 
			
 
				+void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
			
 
				+{
			
 
				+	unsigned sched_ctx;
			
 
				+	int total_max_nworkers = 0;
			
 
				+	int max_cpus = starpu_cpu_worker_get_count();
			
 
				+	double max_workers_idle_time[nsched_ctxs];
			
 
				+	unsigned configured = 0;
			
 
				+	int i;
			
 
				+	for(i = 0; i < nsched_ctxs; i++)
			
 
				+	{
			
 
				+		sched_ctx = sched_ctxs[i];
			
 
				+
			
 
				+		if(hypervisor.sched_ctx_w[sched_ctx].to_be_sized) continue;
			
 
				+
			
 
				+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
			
 
				+		struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
			
 
				+		int worker;
			
 
				+		
			
 
				+		struct starpu_sched_ctx_iterator it;
			
 
				+		if(workers->init_iterator)
			
 
				+			workers->init_iterator(workers, &it);
			
 
				+		
			
 
				+		max_workers_idle_time[i] = 0.0;
			
 
				+		while(workers->has_next(workers, &it))
			
 
				+		{
			
 
				+			worker = workers->get_next(workers, &it);
			
 
				+			if(hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker]==0.0)
			
 
				+			{
			
 
				+				max_workers_idle_time[i] += hypervisor.sched_ctx_w[sched_ctx].idle_time[worker]; /* in seconds */
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				double end_time  = starpu_timing_now();
			
 
				+				double idle = (end_time - hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker]) / 1000000.0; /* in seconds */ 
			
 
				+				max_workers_idle_time[i] += hypervisor.sched_ctx_w[sched_ctx].idle_time[worker] + idle;
			
 
				+			}				
			
 
				+		}			
			
 
				+
			
 
				+		
			
 
				+		double curr_time = starpu_timing_now();
			
 
				+		double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /* in seconds */
			
 
				+		double norm_idle_time = max_workers_idle_time[i] / elapsed_time;
			
 
				+
			
 
				+		config->max_nworkers = 	workers->nworkers - lrint(norm_idle_time) + hypervisor.sched_ctx_w[sched_ctx].nready_tasks;
			
 
				+		
			
 
				+		if(config->max_nworkers < 0)
			
 
				+			config->max_nworkers = 0;
			
 
				+		if(config->max_nworkers > max_cpus)
			
 
				+			config->max_nworkers = max_cpus;
			
 
				+		
			
 
				+		printf("%d: ready tasks  %d idle for long %lf norm_idle_time %lf elapsed_time %lf nworkers %d max %d \n", 
			
 
				+		       sched_ctx, hypervisor.sched_ctx_w[sched_ctx].nready_tasks, max_workers_idle_time[i], norm_idle_time, elapsed_time, workers->nworkers, config->max_nworkers);
			
 
				+
			
 
				+/* 		if(max_workers_idle_time[i] > 0.000002) */
			
 
				+/* 		{ */
			
 
				+/* 			double curr_time = starpu_timing_now(); */
			
 
				+/* 			double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /\* in seconds *\/ */
			
 
				+/* 			double norm_idle_time = max_workers_idle_time[i] / elapsed_time; */
			
 
				+
			
 
				+/* 			config->max_nworkers = 	workers->nworkers - lrint(norm_idle_time); */
			
 
				+/* 			if(config->max_nworkers < 0) */
			
 
				+/* 				config->max_nworkers = 0; */
			
 
				+			
			
 
				+/* 			printf("%d: ready tasks  %d idle for long %lf norm_idle_time %lf elapsed_time %lf nworkers %d decr %d \n",  */
			
 
				+/* 			       sched_ctx, hypervisor.sched_ctx_w[sched_ctx].nready_tasks, max_workers_idle_time[i], norm_idle_time, elapsed_time, workers->nworkers, config->max_nworkers); */
			
 
				+
			
 
				+/* 		} */
			
 
				+/* 		else */
			
 
				+/* 		{ */
			
 
				+/* 			double curr_time = starpu_timing_now(); */
			
 
				+/* 			double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /\* in seconds *\/ */
			
 
				+/* 			double norm_idle_time = max_workers_idle_time[i] / elapsed_time; */
			
 
				+			
			
 
				+/* 			if(workers->nworkers == 0 && hypervisor.sched_ctx_w[sched_ctx].nready_tasks == 1) */
			
 
				+/* 				config->max_nworkers = 0; */
			
 
				+/* 			else */
			
 
				+/* 			{ */
			
 
				+/* 				config->max_nworkers = (hypervisor.sched_ctx_w[sched_ctx].nready_tasks > max_cpus)  */
			
 
				+/* 					? max_cpus : hypervisor.sched_ctx_w[sched_ctx].nready_tasks; */
			
 
				+/* 				config->max_nworkers = workers->nworkers > config->max_nworkers ? workers->nworkers : config->max_nworkers; */
			
 
				+/* 			} */
			
 
				+/* 			printf("%d: ready tasks  %d not idle %lf norm_idle_time %lf elapsed_time %lf nworkers %d incr %d \n",  */
			
 
				+/* 			       sched_ctx, hypervisor.sched_ctx_w[sched_ctx].nready_tasks, max_workers_idle_time[i], norm_idle_time, elapsed_time, workers->nworkers, config->max_nworkers); */
			
 
				+/* 		} */
			
 
				+
			
 
				+		total_max_nworkers += config->max_nworkers;
			
 
				+		configured = 1;
			
 
				+	}
			
 
				+
			
 
				+	/*if the sum of the max cpus is smaller than the total cpus available 
			
 
				+	  increase the max for the ones having more ready tasks to exec */
			
 
				+	if(configured && total_max_nworkers < max_cpus)
			
 
				+	{
			
 
				+		int diff = max_cpus - total_max_nworkers;
			
 
				+		int max_nready = -1;
			
 
				+		unsigned max_nready_sched_ctx = sched_ctxs[0];
			
 
				+		for(i = 0; i < nsched_ctxs; i++)
			
 
				+		{
			
 
				+			if(max_nready < hypervisor.sched_ctx_w[sched_ctxs[i]].nready_tasks)
			
 
				+			{
			
 
				+				max_nready = hypervisor.sched_ctx_w[sched_ctxs[i]].nready_tasks;
			
 
				+				max_nready_sched_ctx = sched_ctxs[i];
			
 
				+			}
			
 
				+		}
			
 
				+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(max_nready_sched_ctx);
			
 
				+		config->max_nworkers += diff;
			
 
				+		printf("%d: redib max_nworkers incr %d \n",  max_nready_sched_ctx, config->max_nworkers);
			
 
				+	}
			
 
				+}
			
 
				 /* notifies the hypervisor that the worker is no longer idle and a new task was pushed on its queue */
			
 
				 static void notify_idle_end(unsigned sched_ctx, int worker)
			
 
				 {
			
@@ -777,7 +899,7 @@ static void notify_idle_end(unsigned sched_ctx, int worker)
 
				 		sc_w->idle_time[worker] += (end_time - sc_w->idle_start_time[worker]) / 1000000.0; /* in seconds */ 
			
 
				 		sc_w->idle_start_time[worker] = 0.0;
			
 
				 	}
			
 
				-
			
 
				+			
			
 
				 	if(hypervisor.policy.handle_idle_end)
			
 
				 		hypervisor.policy.handle_idle_end(sched_ctx, worker);
			
 
				 
			
@@ -793,7 +915,7 @@ static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
 
				 
			
 
				 		if(sc_w->idle_start_time[worker] == 0.0)
			
 
				 			sc_w->idle_start_time[worker] = starpu_timing_now();
			
 
				-
			
 
				+		
			
 
				 		if(hypervisor.policy.handle_idle_cycle)
			
 
				 		{
			
 
				 			hypervisor.policy.handle_idle_cycle(sched_ctx, worker);
			
@@ -842,6 +964,31 @@ static void notify_poped_task(unsigned sched_ctx, int worker, struct starpu_task
 
				 		hypervisor.sched_ctx_w[sched_ctx].ready_flops = 0.0;
			
 
				 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
			
 
				 
			
 
				+/* 	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx); */
			
 
				+	
			
 
				+/* 	unsigned finished_sample = 0; */
			
 
				+/* 	char *speed_sample_criteria = getenv("SC_HYPERVISOR_SAMPLE_CRITERIA"); */
			
 
				+/* 	if(speed_sample_criteria && (strcmp(speed_sample_criteria, "time") == 0)) */
			
 
				+/* 	{ */
			
 
				+
			
 
				+/* 		double curr_time = starpu_timing_now(); */
			
 
				+/* 		double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /\* in seconds *\/ */
			
 
				+
			
 
				+/* 		finished_sample = elapsed_time > config->time_sample; */
			
 
				+/* 	} */
			
 
				+/* 	else */
			
 
				+/* 	{ */
			
 
				+/* 		double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]); */
			
 
				+/* 		double ctx_sample = config->ispeed_ctx_sample; */
			
 
				+
			
 
				+/* 		finished_sample = ctx_elapsed_flops > ctx_sample; */
			
 
				+/* 	} */
			
 
				+
			
 
				+/* 	if(finished_sample) */
			
 
				+/* 	{ */
			
 
				+/* 		sc_hypervisor_update_resize_interval(sched_ctx); */
			
 
				+/* 	} */
			
 
				+	
			
 
				 	if(hypervisor.resize[sched_ctx])
			
 
				 	{	
			
 
				 		if(hypervisor.policy.handle_poped_task)
			
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -126,7 +126,7 @@ noinst_HEADERS = 						\
 
				 	debug/traces/starpu_fxt.h				\
			
 
				 	profiling/bound.h					\
			
 
				 	profiling/profiling.h					\
			
 
				-	util/starpu_insert_task_utils.h				\
			
 
				+	util/starpu_task_insert_utils.h				\
			
 
				 	util/starpu_data_cpy.h					\
			
 
				 	util/starpu_task_list_inline.h				\
			
 
				 	starpu_parameters.h					\
			
@@ -223,8 +223,8 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 
				 	util/file.c						\
			
 
				 	util/misc.c						\
			
 
				 	util/starpu_data_cpy.c					\
			
 
				-	util/starpu_insert_task.c				\
			
 
				-	util/starpu_insert_task_utils.c				\
			
 
				+	util/starpu_task_insert.c				\
			
 
				+	util/starpu_task_insert_utils.c				\
			
 
				 	util/starpu_inlines.c					\
			
 
				 	debug/traces/starpu_fxt.c				\
			
 
				 	debug/traces/starpu_fxt_mpi.c				\
			
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -400,11 +400,11 @@ do {										\
 
				 /* We skip these events becasue they are called so often that they cause FxT to
			
 
				  * fail and make the overall trace unreadable anyway. */
			
 
				 #define _STARPU_TRACE_START_PROGRESS(memnode)		\
			
 
				-	do {} while (0);
			
 
				+	do {} while (0)
			
 
				 //	FUT_DO_PROBE2(_STARPU_FUT_START_PROGRESS, memnode, _starpu_gettid());
			
 
				 
			
 
				 #define _STARPU_TRACE_END_PROGRESS(memnode)		\
			
 
				-	do {} while (0);
			
 
				+	do {} while (0)
			
 
				 	//FUT_DO_PROBE2(_STARPU_FUT_END_PROGRESS, memnode, _starpu_gettid());
			
 
				 	
			
 
				 #define _STARPU_TRACE_USER_EVENT(code)			\
			
@@ -418,80 +418,146 @@ do {										\
 
				 
			
 
				 #ifdef STARPU_FXT_LOCK_TRACES 
			
 
				 
			
 
				-#define _STARPU_TRACE_LOCKING_MUTEX(file,line)	\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_LOCKING_MUTEX,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_MUTEX_LOCKED(file,line)			\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_MUTEX_LOCKED,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_UNLOCKING_MUTEX(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_MUTEX,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_MUTEX_UNLOCKED(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_MUTEX_UNLOCKED,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_TRYLOCK_MUTEX(file,line)			\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_TRYLOCK_MUTEX,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_RDLOCKING_RWLOCK(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RDLOCKING_RWLOCK,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_RWLOCK_RDLOCKED(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_RDLOCKED,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_WRLOCKING_RWLOCK(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_WRLOCKING_RWLOCK,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_RWLOCK_WRLOCKED(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_WRLOCKED,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_UNLOCKING_RWLOCK(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_RWLOCK,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_RWLOCK_UNLOCKED(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_UNLOCKED,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_LOCKING_SPINLOCK(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_LOCKING_SPINLOCK,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_SPINLOCK_LOCKED(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_SPINLOCK_LOCKED,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_UNLOCKING_SPINLOCK(file,line)	\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_SPINLOCK,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_SPINLOCK_UNLOCKED(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_SPINLOCK_UNLOCKED,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_TRYLOCK_SPINLOCK(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_TRYLOCK_SPINLOCK,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_COND_WAIT_BEGIN(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_COND_WAIT_BEGIN,line,_starpu_gettid(),file);
			
 
				-
			
 
				-#define _STARPU_TRACE_COND_WAIT_END(file,line)		\
			
 
				-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_COND_WAIT_END,line,_starpu_gettid(),file);
			
 
				+#define _STARPU_TRACE_LOCKING_MUTEX()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_LOCKING_MUTEX,__LINE__,_starpu_gettid(),file); \
			
 
				+} while (0)
			
 
				+
			
 
				+#define _STARPU_TRACE_MUTEX_LOCKED()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_MUTEX_LOCKED,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_UNLOCKING_MUTEX()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_MUTEX,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_MUTEX_UNLOCKED()	do {\
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_MUTEX_UNLOCKED,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_TRYLOCK_MUTEX()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_TRYLOCK_MUTEX,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_RDLOCKING_RWLOCK()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RDLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_RWLOCK_RDLOCKED()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_RDLOCKED,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_WRLOCKING_RWLOCK()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_WRLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_RWLOCK_WRLOCKED()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_WRLOCKED,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_UNLOCKING_RWLOCK()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_RWLOCK_UNLOCKED()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_UNLOCKED,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define STARPU_TRACE_SPINLOCK_CONDITITION (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER)
			
 
				+
			
 
				+#define _STARPU_TRACE_LOCKING_SPINLOCK()	do {\
			
 
				+	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
			
 
				+		const char *file; \
			
 
				+		file = strrchr(__FILE__,'/') + 1; \
			
 
				+		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_LOCKING_SPINLOCK,__LINE__,_starpu_gettid(),file); \
			
 
				+	} \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_SPINLOCK_LOCKED()		do { \
			
 
				+	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
			
 
				+		const char *file; \
			
 
				+		file = strrchr(__FILE__,'/') + 1; \
			
 
				+		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_SPINLOCK_LOCKED,__LINE__,_starpu_gettid(),file); \
			
 
				+	} \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_UNLOCKING_SPINLOCK()	do { \
			
 
				+	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
			
 
				+		const char *file; \
			
 
				+		file = strrchr(__FILE__,'/') + 1; \
			
 
				+		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_SPINLOCK,__LINE__,_starpu_gettid(),file); \
			
 
				+	} \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_SPINLOCK_UNLOCKED()	do { \
			
 
				+	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
			
 
				+		const char *file; \
			
 
				+		file = strrchr(__FILE__,'/') + 1; \
			
 
				+		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_SPINLOCK_UNLOCKED,__LINE__,_starpu_gettid(),file); \
			
 
				+	} \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_TRYLOCK_SPINLOCK()	do { \
			
 
				+	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
			
 
				+		const char *file; \
			
 
				+		file = strrchr(__FILE__,'/') + 1; \
			
 
				+		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_TRYLOCK_SPINLOCK,__LINE__,_starpu_gettid(),file); \
			
 
				+	} \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_COND_WAIT_BEGIN()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_COND_WAIT_BEGIN,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				+
			
 
				+#define _STARPU_TRACE_COND_WAIT_END()	do { \
			
 
				+	const char *file; \
			
 
				+	file = strrchr(__FILE__,'/') + 1; \
			
 
				+	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_COND_WAIT_END,__LINE__,_starpu_gettid(),file); \
			
 
				+} while(0)
			
 
				 
			
 
				 #else // !STARPU_FXT_LOCK_TRACES
			
 
				 
			
 
				-#define _STARPU_TRACE_LOCKING_MUTEX(file,line)			do {} while(0)
			
 
				-#define _STARPU_TRACE_MUTEX_LOCKED(file,line)			do {} while(0)
			
 
				-#define _STARPU_TRACE_UNLOCKING_MUTEX(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_MUTEX_UNLOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_TRYLOCK_MUTEX(file,line)			do {} while(0)
			
 
				-#define _STARPU_TRACE_RDLOCKING_RWLOCK(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_RWLOCK_RDLOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_WRLOCKING_RWLOCK(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_RWLOCK_WRLOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_UNLOCKING_RWLOCK(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_RWLOCK_UNLOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_LOCKING_SPINLOCK(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_SPINLOCK_LOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_UNLOCKING_SPINLOCK(file,line)	do {} while(0)
			
 
				-#define _STARPU_TRACE_SPINLOCK_UNLOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_TRYLOCK_SPINLOCK(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_COND_WAIT_BEGIN(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_COND_WAIT_END(file,line)			do {} while(0)
			
 
				+#define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
			
 
				+#define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
			
 
				+#define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)
			
 
				+#define _STARPU_TRACE_MUTEX_UNLOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_TRYLOCK_MUTEX()			do {} while(0)
			
 
				+#define _STARPU_TRACE_RDLOCKING_RWLOCK()		do {} while(0)
			
 
				+#define _STARPU_TRACE_RWLOCK_RDLOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_WRLOCKING_RWLOCK()		do {} while(0)
			
 
				+#define _STARPU_TRACE_RWLOCK_WRLOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_UNLOCKING_RWLOCK()		do {} while(0)
			
 
				+#define _STARPU_TRACE_RWLOCK_UNLOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_LOCKING_SPINLOCK()		do {} while(0)
			
 
				+#define _STARPU_TRACE_SPINLOCK_LOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_UNLOCKING_SPINLOCK()	do {} while(0)
			
 
				+#define _STARPU_TRACE_SPINLOCK_UNLOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_TRYLOCK_SPINLOCK()		do {} while(0)
			
 
				+#define _STARPU_TRACE_COND_WAIT_BEGIN()		do {} while(0)
			
 
				+#define _STARPU_TRACE_COND_WAIT_END()			do {} while(0)
			
 
				 
			
 
				 #endif // STARPU_FXT_LOCK_TRACES
			
 
				 
			
@@ -544,24 +610,24 @@ do {										\
 
				 #define _STARPU_TRACE_USER_EVENT(code)		do {} while(0)
			
 
				 #define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0)
			
 
				 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0)
			
 
				-#define _STARPU_TRACE_LOCKING_MUTEX(file,line)			do {} while(0)
			
 
				-#define _STARPU_TRACE_MUTEX_LOCKED(file,line)			do {} while(0)
			
 
				-#define _STARPU_TRACE_UNLOCKING_MUTEX(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_MUTEX_UNLOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_TRYLOCK_MUTEX(file,line)			do {} while(0)
			
 
				-#define _STARPU_TRACE_RDLOCKING_RWLOCK(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_RWLOCK_RDLOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_WRLOCKING_RWLOCK(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_RWLOCK_WRLOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_UNLOCKING_RWLOCK(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_RWLOCK_UNLOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_LOCKING_SPINLOCK(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_SPINLOCK_LOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_UNLOCKING_SPINLOCK(file,line)	do {} while(0)
			
 
				-#define _STARPU_TRACE_SPINLOCK_UNLOCKED(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_TRYLOCK_SPINLOCK(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_COND_WAIT_BEGIN(file,line)		do {} while(0)
			
 
				-#define _STARPU_TRACE_COND_WAIT_END(file,line)			do {} while(0)
			
 
				+#define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
			
 
				+#define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
			
 
				+#define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)
			
 
				+#define _STARPU_TRACE_MUTEX_UNLOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_TRYLOCK_MUTEX()			do {} while(0)
			
 
				+#define _STARPU_TRACE_RDLOCKING_RWLOCK()		do {} while(0)
			
 
				+#define _STARPU_TRACE_RWLOCK_RDLOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_WRLOCKING_RWLOCK()		do {} while(0)
			
 
				+#define _STARPU_TRACE_RWLOCK_WRLOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_UNLOCKING_RWLOCK()		do {} while(0)
			
 
				+#define _STARPU_TRACE_RWLOCK_UNLOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_LOCKING_SPINLOCK()		do {} while(0)
			
 
				+#define _STARPU_TRACE_SPINLOCK_LOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_UNLOCKING_SPINLOCK()	do {} while(0)
			
 
				+#define _STARPU_TRACE_SPINLOCK_UNLOCKED()		do {} while(0)
			
 
				+#define _STARPU_TRACE_TRYLOCK_SPINLOCK()		do {} while(0)
			
 
				+#define _STARPU_TRACE_COND_WAIT_BEGIN()		do {} while(0)
			
 
				+#define _STARPU_TRACE_COND_WAIT_END()			do {} while(0)
			
 
				 #define _STARPU_TRACE_MEMORY_FULL(size)				do {} while(0)
			
 
				 
			
 
				 #endif // STARPU_USE_FXT
			
--- a/src/common/starpu_spinlock.h
+++ b/src/common/starpu_spinlock.h
@@ -54,55 +54,29 @@ int _starpu_spin_destroy(struct _starpu_spinlock *lock);
 
				 
			
 
				 int _starpu_spin_lock(struct _starpu_spinlock *lock);
			
 
				 #define _starpu_spin_lock(lock) ({ \
			
 
				-	const char *file;   \
			
 
				-	if (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER) \
			
 
				-	{ \
			
 
				-		file = strrchr(__FILE__,'/'); \
			
 
				-		file += sizeof(char);\
			
 
				-		_STARPU_TRACE_LOCKING_SPINLOCK(file,__LINE__); \
			
 
				-	}\
			
 
				+	_STARPU_TRACE_LOCKING_SPINLOCK(); \
			
 
				 	_starpu_spin_lock(lock); \
			
 
				-	if (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER) \
			
 
				-	{ \
			
 
				-		file = strrchr(__FILE__,'/'); \
			
 
				-		file += sizeof(char);\
			
 
				-		_STARPU_TRACE_SPINLOCK_LOCKED(file,__LINE__); \
			
 
				-	}\
			
 
				+	_STARPU_TRACE_SPINLOCK_LOCKED(); \
			
 
				 	STARPU_RECORD_LOCK(lock); \
			
 
				 	0; \
			
 
				 }) 
			
 
				 
			
 
				 int _starpu_spin_trylock(struct _starpu_spinlock *lock);
			
 
				 #define _starpu_spin_trylock(lock) ({ \
			
 
				-	const char *file;   \
			
 
				-	if (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER) \
			
 
				-	{ \
			
 
				-		file = strrchr(__FILE__,'/'); \
			
 
				-		file += sizeof(char);\
			
 
				-		_STARPU_TRACE_TRYLOCK_SPINLOCK(file,__LINE__); \
			
 
				-	}\
			
 
				+	_STARPU_TRACE_TRYLOCK_SPINLOCK(); \
			
 
				 	int err = _starpu_spin_trylock(lock); \
			
 
				-	if (!err) \
			
 
				+	if (!err) { \
			
 
				 		STARPU_RECORD_LOCK(lock); \
			
 
				+		_STARPU_TRACE_SPINLOCK_LOCKED(); \
			
 
				+	} \
			
 
				 	err; \
			
 
				 })
			
 
				 int _starpu_spin_checklocked(struct _starpu_spinlock *lock);
			
 
				 int _starpu_spin_unlock(struct _starpu_spinlock *lock);
			
 
				 #define _starpu_spin_unlock(lock) ({ \
			
 
				-	const char *file;   \
			
 
				-	if (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER) \
			
 
				-	{ \
			
 
				-		file = strrchr(__FILE__,'/'); \
			
 
				-		file += sizeof(char);\
			
 
				-		_STARPU_TRACE_UNLOCKING_SPINLOCK(file,__LINE__); \
			
 
				-	}\
			
 
				+	_STARPU_TRACE_UNLOCKING_SPINLOCK(); \
			
 
				 	_starpu_spin_unlock(lock); \
			
 
				-	if (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER) \
			
 
				-	{ \
			
 
				-		file = strrchr(__FILE__,'/'); \
			
 
				-		file += sizeof(char);\
			
 
				-		_STARPU_TRACE_SPINLOCK_UNLOCKED(file,__LINE__); \
			
 
				-	}\
			
 
				+	_STARPU_TRACE_SPINLOCK_UNLOCKED(); \
			
 
				 	0; \
			
 
				 }) 
			
 
				 
			
--- a/src/common/thread.c
+++ b/src/common/thread.c
@@ -84,43 +84,36 @@ int starpu_pthread_mutex_destroy(starpu_pthread_mutex_t *mutex)
 
				 
			
 
				 int starpu_pthread_mutex_lock(starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	const char *file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-
			
 
				-	_STARPU_TRACE_LOCKING_MUTEX(file,__LINE__);
			
 
				+	_STARPU_TRACE_LOCKING_MUTEX();
			
 
				 
			
 
				 	if (!*mutex) STARPU_PTHREAD_MUTEX_INIT(mutex, NULL);
			
 
				 
			
 
				 	xbt_mutex_acquire(*mutex);
			
 
				 
			
 
				-	_STARPU_TRACE_MUTEX_LOCKED(file,__LINE__);
			
 
				+	_STARPU_TRACE_MUTEX_LOCKED();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	const char *file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_UNLOCKING_MUTEX(file,__LINE__);
			
 
				+	_STARPU_TRACE_UNLOCKING_MUTEX();
			
 
				 
			
 
				 	xbt_mutex_release(*mutex);
			
 
				 
			
 
				-	_STARPU_TRACE_MUTEX_UNLOCKED(file,__LINE__);
			
 
				+	_STARPU_TRACE_MUTEX_UNLOCKED();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	const char *file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_TRYLOCK_MUTEX(file,__LINE__);
			
 
				+	_STARPU_TRACE_TRYLOCK_MUTEX();
			
 
				 
			
 
				 	xbt_mutex_acquire(*mutex);
			
 
				+
			
 
				+	_STARPU_TRACE_MUTEX_LOCKED();
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -185,16 +178,13 @@ int starpu_pthread_cond_broadcast(starpu_pthread_cond_t *cond)
 
				 
			
 
				 int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	const char* file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_COND_WAIT_BEGIN(file,__LINE__);
			
 
				+	_STARPU_TRACE_COND_WAIT_BEGIN();
			
 
				 
			
 
				 	if (!*cond)
			
 
				 		STARPU_PTHREAD_COND_INIT(cond, NULL);
			
 
				 	xbt_cond_wait(*cond, *mutex);
			
 
				 
			
 
				-	_STARPU_TRACE_COND_WAIT_END(file,__LINE__);
			
 
				+	_STARPU_TRACE_COND_WAIT_END();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -218,42 +208,33 @@ int starpu_pthread_rwlock_destroy(starpu_pthread_rwlock_t *rwlock)
 
				 
			
 
				 int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
			
 
				 {
			
 
				-	const char* file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_RDLOCKING_RWLOCK(file,__LINE__);
			
 
				+	_STARPU_TRACE_RDLOCKING_RWLOCK();
			
 
				 
			
 
				  	int p_ret = starpu_pthread_mutex_lock(rwlock);
			
 
				 
			
 
				-	_STARPU_TRACE_RWLOCK_RDLOCKED(file,__LINE__);
			
 
				+	_STARPU_TRACE_RWLOCK_RDLOCKED();
			
 
				 
			
 
				 	return p_ret;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
			
 
				 {
			
 
				-	const char* file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_WRLOCKING_RWLOCK(file,__LINE__);
			
 
				+	_STARPU_TRACE_WRLOCKING_RWLOCK();
			
 
				 
			
 
				  	int p_ret = starpu_pthread_mutex_lock(rwlock);
			
 
				 
			
 
				-	_STARPU_TRACE_RWLOCK_WRLOCKED(file,__LINE__);
			
 
				+	_STARPU_TRACE_RWLOCK_WRLOCKED();
			
 
				 
			
 
				 	return p_ret;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
			
 
				 {
			
 
				-	const char* file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_UNLOCKING_RWLOCK(file,__LINE__);
			
 
				+	_STARPU_TRACE_UNLOCKING_RWLOCK();
			
 
				 
			
 
				  	int p_ret = starpu_pthread_mutex_unlock(rwlock);
			
 
				 
			
 
				-	_STARPU_TRACE_RWLOCK_UNLOCKED(file,__LINE__);
			
 
				+	_STARPU_TRACE_RWLOCK_UNLOCKED();
			
 
				 
			
 
				 	return p_ret;
			
 
				 }
			
@@ -262,94 +243,79 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 
				 
			
 
				 int starpu_pthread_mutex_lock(starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	const char *file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_LOCKING_MUTEX(file,__LINE__);
			
 
				+	_STARPU_TRACE_LOCKING_MUTEX();
			
 
				 
			
 
				 	int p_ret = pthread_mutex_lock(mutex);
			
 
				 
			
 
				-	_STARPU_TRACE_MUTEX_LOCKED(file,__LINE__);
			
 
				+	_STARPU_TRACE_MUTEX_LOCKED();
			
 
				 
			
 
				 	return p_ret;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	const char *file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_UNLOCKING_MUTEX(file,__LINE__);
			
 
				+	_STARPU_TRACE_UNLOCKING_MUTEX();
			
 
				 
			
 
				 	int p_ret = pthread_mutex_unlock(mutex);
			
 
				 
			
 
				-	_STARPU_TRACE_MUTEX_UNLOCKED(file,__LINE__);
			
 
				+	_STARPU_TRACE_MUTEX_UNLOCKED();
			
 
				 
			
 
				 	return p_ret;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	const char *file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_LOCKING_MUTEX(file,__LINE__);
			
 
				+	int ret;
			
 
				+	_STARPU_TRACE_TRYLOCK_MUTEX();
			
 
				+
			
 
				+	ret = pthread_mutex_trylock(mutex);
			
 
				+
			
 
				+	if (!ret)
			
 
				+		_STARPU_TRACE_MUTEX_LOCKED();
			
 
				 
			
 
				-	return pthread_mutex_trylock(mutex);
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	const char* file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_COND_WAIT_BEGIN(file,__LINE__);
			
 
				+	_STARPU_TRACE_COND_WAIT_BEGIN();
			
 
				 
			
 
				  	int p_ret = pthread_cond_wait(cond, mutex);
			
 
				 
			
 
				-	_STARPU_TRACE_COND_WAIT_END(file,__LINE__);
			
 
				+	_STARPU_TRACE_COND_WAIT_END();
			
 
				 
			
 
				 	return p_ret;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
			
 
				 {
			
 
				-	const char* file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_RDLOCKING_RWLOCK(file,__LINE__);
			
 
				+	_STARPU_TRACE_RDLOCKING_RWLOCK();
			
 
				 
			
 
				  	int p_ret = pthread_rwlock_rdlock(rwlock);
			
 
				 
			
 
				-	_STARPU_TRACE_RWLOCK_RDLOCKED(file,__LINE__);
			
 
				+	_STARPU_TRACE_RWLOCK_RDLOCKED();
			
 
				 
			
 
				 	return p_ret;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
			
 
				 {
			
 
				-	const char* file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_WRLOCKING_RWLOCK(file,__LINE__);
			
 
				+	_STARPU_TRACE_WRLOCKING_RWLOCK();
			
 
				 
			
 
				  	int p_ret = pthread_rwlock_wrlock(rwlock);
			
 
				 
			
 
				-	_STARPU_TRACE_RWLOCK_WRLOCKED(file,__LINE__);
			
 
				+	_STARPU_TRACE_RWLOCK_WRLOCKED();
			
 
				 
			
 
				 	return p_ret;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
			
 
				 {
			
 
				-	const char* file;
			
 
				-	file = strrchr(__FILE__,'/');
			
 
				-	file += sizeof(char);
			
 
				-	_STARPU_TRACE_UNLOCKING_RWLOCK(file,__LINE__);
			
 
				+	_STARPU_TRACE_UNLOCKING_RWLOCK();
			
 
				 
			
 
				  	int p_ret = pthread_rwlock_unlock(rwlock);
			
 
				 
			
 
				-	_STARPU_TRACE_RWLOCK_UNLOCKED(file,__LINE__);
			
 
				+	_STARPU_TRACE_RWLOCK_UNLOCKED();
			
 
				 
			
 
				 	return p_ret;
			
 
				 }
			
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -25,23 +25,14 @@
 
				 #include <stdlib.h>
			
 
				 #include <math.h>
			
 
				 #include <pthread.h>
			
 
				+#ifdef STARPU_HAVE_SCHED_YIELD
			
 
				+#include <sched.h>
			
 
				+#endif
			
 
				 
			
 
				 #ifdef STARPU_HAVE_HELGRIND_H
			
 
				 #include <valgrind/helgrind.h>
			
 
				 #endif
			
 
				 
			
 
				-#ifndef VALGRIND_HG_MUTEX_LOCK_PRE
			
 
				-#define VALGRIND_HG_MUTEX_LOCK_PRE(mutex, istrylock) ((void)0)
			
 
				-#endif
			
 
				-#ifndef VALGRIND_HG_MUTEX_LOCK_POST
			
 
				-#define VALGRIND_HG_MUTEX_LOCK_POST(mutex) ((void)0)
			
 
				-#endif
			
 
				-#ifndef VALGRIND_HG_MUTEX_UNLOCK_PRE
			
 
				-#define VALGRIND_HG_MUTEX_UNLOCK_PRE(mutex) ((void)0)
			
 
				-#endif
			
 
				-#ifndef VALGRIND_HG_MUTEX_UNLOCK_POST
			
 
				-#define VALGRIND_HG_MUTEX_UNLOCK_POST(mutex) ((void)0)
			
 
				-#endif
			
 
				 #ifndef DO_CREQ_v_WW
			
 
				 #define DO_CREQ_v_WW(_creqF, _ty1F, _arg1F, _ty2F, _arg2F) ((void)0)
			
 
				 #endif
			
@@ -54,25 +45,14 @@
 
				 #ifndef ANNOTATE_HAPPENS_AFTER
			
 
				 #define ANNOTATE_HAPPENS_AFTER(obj) ((void)0)
			
 
				 #endif
			
 
				-#ifndef ANNOTATE_RWLOCK_ACQUIRED
			
 
				-#define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) ((void)0)
			
 
				+#ifndef VALGRIND_HG_DISABLE_CHECKING
			
 
				+#define VALGRIND_HG_DISABLE_CHECKING(start, len) ((void)0)
			
 
				 #endif
			
 
				-#ifndef ANNOTATE_RWLOCK_RELEASED
			
 
				-#define ANNOTATE_RWLOCK_RELEASED(lock, is_w) ((void)0)
			
 
				+#ifndef VALGRIND_HG_ENABLE_CHECKING
			
 
				+#define VALGRIND_HG_ENABLE_CHECKING(start, len) ((void)0)
			
 
				 #endif
			
 
				-
			
 
				-#define _STARPU_VALGRIND_HG_SPIN_LOCK_PRE(lock) \
			
 
				-	DO_CREQ_v_WW(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_PRE, \
			
 
				-			struct _starpu_spinlock *, lock, long, 0)
			
 
				-#define _STARPU_VALGRIND_HG_SPIN_LOCK_POST(lock) \
			
 
				-	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_LOCK_POST, \
			
 
				-			struct _starpu_spinlock *, lock)
			
 
				-#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(lock) \
			
 
				-	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_PRE, \
			
 
				-			struct _starpu_spinlock *, lock)
			
 
				-#define _STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(lock) \
			
 
				-	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_POST, \
			
 
				-			struct _starpu_spinlock *, lock)
			
 
				+#define STARPU_HG_DISABLE_CHECKING(variable) VALGRIND_HG_DISABLE_CHECKING(&(variable), sizeof(variable))
			
 
				+#define STARPU_HG_ENABLE_CHECKING(variable)  VALGRIND_HG_ENABLE_CHECKING(&(variable), sizeof(variable))
			
 
				 
			
 
				 #if defined(__KNC__) || defined(__KNF__)
			
 
				 #define STARPU_DEBUG_PREFIX "[starpu-mic]"
			
@@ -80,6 +60,19 @@
 
				 #define STARPU_DEBUG_PREFIX "[starpu]"
			
 
				 #endif
			
 
				 
			
 
				+/* This is needed in some places to make valgrind yield to another thread to be
			
 
				+ * able to progress.  */
			
 
				+#if defined(__i386__) || defined(__x86_64__)
			
 
				+#define _STARPU_UYIELD() __asm__ __volatile("rep; nop")
			
 
				+#else
			
 
				+#define _STARPU_UYIELD() ((void)0)
			
 
				+#endif
			
 
				+#if defined(STARPU_HAVE_SCHED_YIELD) && defined(STARPU_HAVE_HELGRIND_H)
			
 
				+#define STARPU_UYIELD() do { if (RUNNING_ON_VALGRIND) sched_yield(); else _STARPU_UYIELD(); } while (0)
			
 
				+#else
			
 
				+#define STARPU_UYIELD() _STARPU_UYIELD()
			
 
				+#endif
			
 
				+
			
 
				 #ifdef STARPU_VERBOSE
			
 
				 #  define _STARPU_DEBUG(fmt, ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%s] " fmt ,__starpu_func__ ,## __VA_ARGS__); fflush(stderr); }} while(0)
			
 
				 #else
			
--- a/src/core/dependencies/implicit_data_deps.c
+++ b/src/core/dependencies/implicit_data_deps.c
@@ -481,8 +481,14 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 
				 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
			
 
				 	unsigned do_submit_tasks = 0;
			
 
				 
			
 
				-	if (handle->post_sync_tasks_cnt > 0)
			
 
				+	/* Here helgrind would shout that this is an unprotected access, but
			
 
				+	 * count can only be zero if we don't have to care about
			
 
				+	 * post_sync_tasks_cnt at all.  */
			
 
				+	unsigned count = handle->post_sync_tasks_cnt;
			
 
				+
			
 
				+	if (count)
			
 
				 	{
			
 
				+		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
			
 
				 		if (--handle->post_sync_tasks_cnt == 0)
			
 
				 		{
			
 
				 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
			
@@ -490,6 +496,7 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 
				 			post_sync_tasks = handle->post_sync_tasks;
			
 
				 			handle->post_sync_tasks = NULL;
			
 
				 		}
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
			
 
				 	}
			
 
				 
			
 
				 	if (do_submit_tasks)
			
--- a/src/core/dependencies/task_deps.c
+++ b/src/core/dependencies/task_deps.c
@@ -70,9 +70,9 @@ void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, s
 
				 		STARPU_ASSERT_MSG(!job->submitted || !task->destroy || task->detach, "Task dependencies have to be set before submission (submitted %u destroy %d detach %d)", job->submitted, task->destroy, task->detach);
			
 
				 	else
			
 
				 		STARPU_ASSERT_MSG(job->terminated <= 1, "Task dependencies have to be set before termination (terminated %u)", job->terminated);
			
 
				-	STARPU_PTHREAD_MUTEX_UNLOCK(&job->sync_mutex);
			
 
				 
			
 
				 	struct _starpu_cg *cg = create_cg_task(ndeps, job);
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&job->sync_mutex);
			
 
				 
			
 
				 	unsigned i;
			
 
				 	for (i = 0; i < ndeps; i++)
			
--- a/src/core/disk_ops/disk_stdio.c
+++ b/src/core/disk_ops/disk_stdio.c
@@ -55,10 +55,9 @@ starpu_stdio_alloc (void *base, size_t size)
 
				 	int id = -1;
			
 
				 
			
 
				 	/* create template for mkstemp */
			
 
				-	char * baseCpy = malloc(strlen(base)+8);
			
 
				-	STARPU_ASSERT(baseCpy != NULL);
			
 
				-
			
 
				 	char * tmp = "STARPU_XXXXXX";
			
 
				+	char * baseCpy = malloc(strlen(base)+1+strlen(tmp)+1);
			
 
				+	STARPU_ASSERT(baseCpy != NULL);
			
 
				 
			
 
				 	strcpy(baseCpy, (char *) base);
			
 
				 	strcat(baseCpy,"/");
			
@@ -294,7 +293,7 @@ get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 
				 	struct timeval end;
			
 
				 	
			
 
				 	srand (time (NULL)); 
			
 
				-	char * buf = malloc(SIZE_DISK_MIN*sizeof(char));
			
 
				+	char * buf = malloc(SIZE_DISK_MIN);
			
 
				 	STARPU_ASSERT(buf != NULL);
			
 
				 	
			
 
				 	/* allocate memory */
			
@@ -304,6 +303,8 @@ get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 
				 		return 0;
			
 
				 	struct starpu_stdio_obj * tmp = (struct starpu_stdio_obj *) mem;
			
 
				 
			
 
				+	memset(buf, 0, SIZE_DISK_MIN);
			
 
				+
			
 
				 	/* Measure upload slowness */
			
 
				 	gettimeofday(&start, NULL);
			
 
				 	for (iter = 0; iter < NITER; ++iter)
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -262,7 +262,8 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 
				 		_starpu_spin_lock(&handle->header_lock);
			
 
				 		handle->refcnt--;
			
 
				 		handle->busy_count--;
			
 
				-		_starpu_spin_unlock(&handle->header_lock);
			
 
				+		if (!_starpu_data_check_not_busy(handle))
			
 
				+			_starpu_spin_unlock(&handle->header_lock);
			
 
				 		starpu_task_clean(conversion_task);
			
 
				 		free(conversion_task);
			
 
				 	}
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -717,6 +717,7 @@ static void benchmark_all_gpu_devices(void)
 
				 
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
			
 
				+	hwloc_bitmap_free(former_cpuset);
			
 
				 #elif __linux__
			
 
				 	/* Restore the former affinity */
			
 
				 	ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -255,6 +255,12 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_perfmodel_per_arch
 
				 		{
			
 
				 			entry = (struct starpu_perfmodel_history_entry *) malloc(sizeof(struct starpu_perfmodel_history_entry));
			
 
				 			STARPU_ASSERT(entry);
			
 
				+
			
 
				+			/* Tell  helgrind that we do not care about
			
 
				+			 * racing access to the sampling, we only want a
			
 
				+			 * good-enough estimation */
			
 
				+			STARPU_HG_DISABLE_CHECKING(entry->nsample);
			
 
				+			STARPU_HG_DISABLE_CHECKING(entry->mean);
			
 
				 		}
			
 
				 
			
 
				 		scan_history_entry(f, entry);
			
@@ -1210,13 +1216,15 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
				 		HASH_FIND_UINT32_T(history, &key, entry);
			
 
				 		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
			
 
				 
			
 
				-		/* We do not care about racing access to the mean, we only want a
			
 
				-		 * good-enough estimation, thus simulate taking the rdlock */
			
 
				-		ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
			
 
				+		/* Here helgrind would shout that this is unprotected access.
			
 
				+		 * We do not care about racing access to the mean, we only want
			
 
				+		 * a good-enough estimation */
			
 
				 
			
 
				 		if (entry && entry->history_entry && entry->history_entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
			
 
				 			exp = entry->history_entry->mean;
			
 
				-		else if (!model->benchmarking)
			
 
				+
			
 
				+		STARPU_HG_DISABLE_CHECKING(model->benchmarking);
			
 
				+		if (isnan(exp) && !model->benchmarking)
			
 
				 		{
			
 
				 			char archname[32];
			
 
				 
			
@@ -1225,7 +1233,6 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
				 			_starpu_set_calibrate_flag(1);
			
 
				 			model->benchmarking = 1;
			
 
				 		}
			
 
				-		ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
			
 
				 	}
			
 
				 
			
 
				 	return exp;
			
@@ -1233,7 +1240,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
				 
			
 
				 double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j,unsigned nimpl)
			
 
				 {
			
 
				-	double exp;
			
 
				+	double exp = NAN;
			
 
				 	struct starpu_perfmodel_per_arch *per_arch_model;
			
 
				 	struct starpu_perfmodel_history_entry *entry;
			
 
				 	struct starpu_perfmodel_history_table *history, *elt;
			
@@ -1248,18 +1255,17 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 
				 	entry = (elt == NULL) ? NULL : elt->history_entry;
			
 
				 	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
			
 
				 
			
 
				-	/* We do not care about racing access to the mean, we only want a
			
 
				-	 * good-enough estimation, thus simulate taking the rdlock */
			
 
				-	ANNOTATE_RWLOCK_ACQUIRED(&model->model_rwlock, 0);
			
 
				-
			
 
				-	exp = entry?entry->mean:NAN;
			
 
				+	/* Here helgrind would shout that this is unprotected access.
			
 
				+	 * We do not care about racing access to the mean, we only want
			
 
				+	 * a good-enough estimation */
			
 
				 
			
 
				-	if (entry && entry->nsample < _STARPU_CALIBRATION_MINIMUM)
			
 
				+	if (entry && entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
			
 
				 		/* TODO: report differently if we've scheduled really enough
			
 
				 		 * of that task and the scheduler should perhaps put it aside */
			
 
				-		/* Not calibrated enough */
			
 
				-		exp = NAN;
			
 
				+		/* Calibrated enough */
			
 
				+		exp = entry->mean;
			
 
				 
			
 
				+	STARPU_HG_DISABLE_CHECKING(model->benchmarking);
			
 
				 	if (isnan(exp) && !model->benchmarking)
			
 
				 	{
			
 
				 		char archname[32];
			
@@ -1270,8 +1276,6 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 
				 		model->benchmarking = 1;
			
 
				 	}
			
 
				 
			
 
				-	ANNOTATE_RWLOCK_RELEASED(&model->model_rwlock, 0);
			
 
				-
			
 
				 	return exp;
			
 
				 }
			
 
				 
			
@@ -1310,6 +1314,13 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 				/* this is the first entry with such a footprint */
			
 
				 				entry = (struct starpu_perfmodel_history_entry *) malloc(sizeof(struct starpu_perfmodel_history_entry));
			
 
				 				STARPU_ASSERT(entry);
			
 
				+
			
 
				+				/* Tell  helgrind that we do not care about
			
 
				+				 * racing access to the sampling, we only want a
			
 
				+				 * good-enough estimation */
			
 
				+				STARPU_HG_DISABLE_CHECKING(entry->nsample);
			
 
				+				STARPU_HG_DISABLE_CHECKING(entry->mean);
			
 
				+
			
 
				 				entry->mean = measured;
			
 
				 				entry->sum = measured;
			
 
				 
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -103,7 +103,8 @@ static struct starpu_sched_policy *find_sched_policy_from_name(const char *polic
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				-	fprintf(stderr, "Warning: scheduling policy \"%s\" was not found, try \"help\" to get a list\n", policy_name);
			
 
				+	if (strcmp(policy_name, "help") != 0)
			
 
				+	     fprintf(stderr, "Warning: scheduling policy \"%s\" was not found, try \"help\" to get a list\n", policy_name);
			
 
				 
			
 
				 	/* nothing was found */
			
 
				 	return NULL;
			
@@ -117,12 +118,13 @@ static void display_sched_help_message(void)
 
				 		/* display the description of all predefined policies */
			
 
				 		struct starpu_sched_policy **policy;
			
 
				 
			
 
				-		fprintf(stderr, "STARPU_SCHED can be either of\n");
			
 
				+		fprintf(stderr, "\nThe variable STARPU_SCHED can be set to one of the following strings:\n");
			
 
				 		for(policy=predefined_policies ; *policy!=NULL ; policy++)
			
 
				 		{
			
 
				 			struct starpu_sched_policy *p = *policy;
			
 
				 			fprintf(stderr, "%s\t-> %s\n", p->policy_name, p->policy_description);
			
 
				 		}
			
 
				+		fprintf(stderr, "\n");
			
 
				 	 }
			
 
				 }
			
 
				 
			
@@ -630,18 +632,29 @@ pick:
 
				 				}
			
 
				 			}
			
 
				 
			
 
				-			if(!task && sched_ctx && worker->removed_from_ctx[sched_ctx->id])
			
 
				+			if(!task)
			
 
				 			{
			
 
				-				_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
			
 
				-				worker->removed_from_ctx[sched_ctx->id] = 0;
			
 
				-			}
			
 
				+				if(sched_ctx && worker->removed_from_ctx[sched_ctx->id])
			
 
				+				{
			
 
				+					_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
			
 
				+					worker->removed_from_ctx[sched_ctx->id] = 0;
			
 
				+				} 
			
 
				+#ifdef STARPU_USE_SC_HYPERVISOR
			
 
				+				else 
			
 
				+				{
			
 
				+					struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
			
 
				+					if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_cycle)
			
 
				+						perf_counters->notify_idle_cycle(sched_ctx->id, worker->workerid, 1.0);
			
 
				+				}
			
 
				+#endif //STARPU_USE_SC_HYPERVISOR
			
 
				+					
			
 
				 #ifndef STARPU_NON_BLOCKING_DRIVERS
			
 
				-			if((!task && sched_ctx->pop_counter[worker->workerid] == 0 && been_here[sched_ctx->id]) || worker->nsched_ctxs == 1)
			
 
				-				break;
			
 
				-
			
 
				-
			
 
				-			been_here[sched_ctx->id] = 1;
			
 
				+				if((sched_ctx->pop_counter[worker->workerid] == 0 && been_here[sched_ctx->id]) || worker->nsched_ctxs == 1)
			
 
				+					break;
			
 
				+				been_here[sched_ctx->id] = 1;
			
 
				 #endif
			
 
				+			}
			
 
				+			
			
 
				 			sched_ctx->pop_counter[worker->workerid]++;
			
 
				 		}
			
 
				 	  }
			
@@ -650,6 +663,17 @@ pick:
 
				 	if (!task)
			
 
				 		return NULL;
			
 
				 
			
 
				+
			
 
				+
			
 
				+#ifdef STARPU_USE_SC_HYPERVISOR
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
			
 
				+	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
			
 
				+
			
 
				+	if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_end)
			
 
				+		perf_counters->notify_idle_end(task->sched_ctx, worker->workerid);
			
 
				+#endif //STARPU_USE_SC_HYPERVISOR
			
 
				+
			
 
				+
			
 
				 	/* Make sure we do not bother with all the multiformat-specific code if
			
 
				 	 * it is not necessary. */
			
 
				 	if (!_starpu_task_uses_multiformat_handles(task))
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -160,6 +160,14 @@ void _starpu_task_destroy(struct starpu_task *task)
 
				 		if (task->cl_arg_free)
			
 
				 			free(task->cl_arg);
			
 
				 
			
 
				+		/* Does user want StarPU release callback_arg ? */
			
 
				+		if (task->callback_arg_free)
			
 
				+			free(task->callback_arg);
			
 
				+
			
 
				+		/* Does user want StarPU release prologue_callback_arg ? */
			
 
				+		if (task->prologue_callback_arg_free)
			
 
				+			free(task->prologue_callback_arg);
			
 
				+
			
 
				 		free(task);
			
 
				 	}
			
 
				 }
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -383,7 +383,68 @@ static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
 
				 struct itimerval prof_itimer;
			
 
				 #endif
			
 
				 
			
 
				-void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key)
			
 
				+static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu_machine_config *pconfig)
			
 
				+{
			
 
				+	workerarg->config = pconfig;
			
 
				+	STARPU_PTHREAD_MUTEX_INIT(&workerarg->mutex, NULL);
			
 
				+	/* arch initialized by topology.c */
			
 
				+	/* worker_mask initialized by topology.c */
			
 
				+	/* perf_arch initialized by topology.c */
			
 
				+	/* worker_thread initialized by _starpu_launch_drivers */
			
 
				+	/* mp_nodeid initialized by topology.c */
			
 
				+	/* devid initialized by topology.c */
			
 
				+	/* bindid initialized by topology.c */
			
 
				+	/* workerid initialized by topology.c */
			
 
				+	workerarg->combined_workerid = workerarg->workerid;
			
 
				+	workerarg->current_rank = 0;
			
 
				+	workerarg->worker_size = 1;
			
 
				+	STARPU_PTHREAD_COND_INIT(&workerarg->started_cond, NULL);
			
 
				+	STARPU_PTHREAD_COND_INIT(&workerarg->ready_cond, NULL);
			
 
				+	/* memory_node initialized by topology.c */
			
 
				+	STARPU_PTHREAD_COND_INIT(&workerarg->sched_cond, NULL);
			
 
				+	STARPU_PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
			
 
				+	starpu_task_list_init(&workerarg->local_tasks);
			
 
				+	workerarg->current_task = NULL;
			
 
				+	workerarg->set = NULL;
			
 
				+
			
 
				+	/* if some codelet's termination cannot be handled directly :
			
 
				+	 * for instance in the Gordon driver, Gordon tasks' callbacks
			
 
				+	 * may be executed by another thread than that of the Gordon
			
 
				+	 * driver so that we cannot call the push_codelet_output method
			
 
				+	 * directly */
			
 
				+	workerarg->terminated_jobs = _starpu_job_list_new();
			
 
				+
			
 
				+	workerarg->worker_is_running = 0;
			
 
				+	workerarg->worker_is_initialized = 0;
			
 
				+	workerarg->status = STATUS_INITIALIZING;
			
 
				+	/* name initialized by driver */
			
 
				+	/* short_name initialized by driver */
			
 
				+	workerarg->run_by_starpu = 1;
			
 
				+
			
 
				+	workerarg->sched_ctx_list = NULL;
			
 
				+	workerarg->nsched_ctxs = 0;
			
 
				+	_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
			
 
				+
			
 
				+	workerarg->has_prev_init = 0;
			
 
				+
			
 
				+	int ctx;
			
 
				+	for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
			
 
				+		workerarg->removed_from_ctx[ctx] = 0;
			
 
				+
			
 
				+	workerarg->spinning_backoff = 1;
			
 
				+
			
 
				+	STARPU_PTHREAD_COND_INIT(&workerarg->parallel_sect_cond, NULL);
			
 
				+	STARPU_PTHREAD_MUTEX_INIT(&workerarg->parallel_sect_mutex, NULL);
			
 
				+
			
 
				+	workerarg->parallel_sect = 0;
			
 
				+
			
 
				+	for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
			
 
				+		workerarg->shares_tasks_lists[ctx] = 0;
			
 
				+
			
 
				+	/* cpu_set/hwloc_cpu_set initialized in topology.c */
			
 
				+}
			
 
				+
			
 
				+void _starpu_worker_start(struct _starpu_worker *worker, unsigned fut_key)
			
 
				 {
			
 
				 	(void) fut_key;
			
 
				 	int devid = worker->devid;
			
@@ -415,7 +476,6 @@ void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key)
 
				 	worker->worker_is_running = 1;
			
 
				 	STARPU_PTHREAD_COND_SIGNAL(&worker->started_cond);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
			
 
				-	worker->spinning_backoff = 1;
			
 
				 }
			
 
				 
			
 
				 static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
			
@@ -446,51 +506,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
				 		unsigned mp_nodeid = workerarg->mp_nodeid;
			
 
				 #endif
			
 
				 
			
 
				-		workerarg->config = pconfig;
			
 
				-
			
 
				-		_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
			
 
				-
			
 
				-		STARPU_PTHREAD_MUTEX_INIT(&workerarg->mutex, NULL);
			
 
				-		STARPU_PTHREAD_COND_INIT(&workerarg->started_cond, NULL);
			
 
				-		STARPU_PTHREAD_COND_INIT(&workerarg->ready_cond, NULL);
			
 
				-
			
 
				-		workerarg->worker_size = 1;
			
 
				-		workerarg->combined_workerid = workerarg->workerid;
			
 
				-		workerarg->current_rank = 0;
			
 
				-		workerarg->has_prev_init = 0;
			
 
				-		/* mutex + cond only for the local list */
			
 
				-		/* we have a single local list */
			
 
				-		/* afterwards there would be a mutex + cond for the list of each strategy */
			
 
				-		workerarg->run_by_starpu = 1;
			
 
				-		workerarg->worker_is_running = 0;
			
 
				-		workerarg->worker_is_initialized = 0;
			
 
				-		workerarg->set = NULL;
			
 
				-
			
 
				-		int ctx;
			
 
				-		for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
			
 
				-		{
			
 
				-			workerarg->removed_from_ctx[ctx] = 0;
			
 
				-			workerarg->shares_tasks_lists[ctx] = 0;
			
 
				-		}
			
 
				-
			
 
				-
			
 
				-		STARPU_PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
			
 
				-		STARPU_PTHREAD_COND_INIT(&workerarg->sched_cond, NULL);
			
 
				-		STARPU_PTHREAD_MUTEX_INIT(&workerarg->parallel_sect_mutex, NULL);
			
 
				-		STARPU_PTHREAD_COND_INIT(&workerarg->parallel_sect_cond, NULL);
			
 
				-		workerarg->parallel_sect = 0;
			
 
				-
			
 
				-		/* if some codelet's termination cannot be handled directly :
			
 
				-		 * for instance in the Gordon driver, Gordon tasks' callbacks
			
 
				-		 * may be executed by another thread than that of the Gordon
			
 
				-		 * driver so that we cannot call the push_codelet_output method
			
 
				-		 * directly */
			
 
				-		workerarg->terminated_jobs = _starpu_job_list_new();
			
 
				-
			
 
				-		starpu_task_list_init(&workerarg->local_tasks);
			
 
				-
			
 
				-		workerarg->status = STATUS_INITIALIZING;
			
 
				-
			
 
				 		_STARPU_DEBUG("initialising worker %u/%u\n", worker, nworkers);
			
 
				 
			
 
				 		_starpu_init_worker_queue(workerarg);
			
@@ -857,6 +872,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
				 {
			
 
				 	int is_a_sink = 0; /* Always defined. If the MP infrastructure is not
			
 
				 			    * used, we cannot be a sink. */
			
 
				+	unsigned worker;
			
 
				 #ifdef STARPU_USE_MP
			
 
				 	_starpu_set_argc_argv(argc, argv);
			
 
				 
			
@@ -884,6 +900,9 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
				 	_STARPU_DISP("Warning: StarPU was configured with --enable-debug (-O0), and is thus not optimized\n");
			
 
				 #endif
			
 
				 #endif
			
 
				+#ifdef STARPU_SPINLOCK_CHECK
			
 
				+	_STARPU_DISP("Warning: StarPU was configured with --enable-spinlock-check, which slows down a bit\n");
			
 
				+#endif
			
 
				 #if 0
			
 
				 #ifndef STARPU_NO_ASSERT
			
 
				 	_STARPU_DISP("Warning: StarPU was configured without --enable-fast\n");
			
@@ -1005,6 +1024,9 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
				 	 * threads */
			
 
				 	_starpu_initialize_current_task_key();
			
 
				 
			
 
				+	for (worker = 0; worker < config.topology.nworkers; worker++)
			
 
				+		_starpu_worker_init(&config.workers[worker], &config);
			
 
				+
			
 
				 	if (!is_a_sink)
			
 
				 	{
			
 
				 		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&config, config.conf->sched_policy_name);
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -52,6 +52,7 @@
 
				 
			
 
				 #include <starpu_parameters.h>
			
 
				 
			
 
				+/* This is initialized from in _starpu_worker_init */
			
 
				 struct _starpu_worker
			
 
				 {
			
 
				 	struct _starpu_machine_config *config;
			
@@ -341,7 +342,7 @@ void _starpu_block_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthr
 
				 void _starpu_set_local_worker_key(struct _starpu_worker *worker);
			
 
				 
			
 
				 /* This function initializes the current thread for the given worker */
			
 
				-void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key);
			
 
				+void _starpu_worker_start(struct _starpu_worker *worker, unsigned fut_key);
			
 
				 
			
 
				 /* Returns the _starpu_worker structure that describes the state of the
			
 
				  * current worker. */
			
--- a/src/datawizard/coherency.h
+++ b/src/datawizard/coherency.h
@@ -178,6 +178,7 @@ struct _starpu_data_state
 
				 	unsigned long last_submitted_ghost_sync_id;
			
 
				 	struct _starpu_jobid_list *last_submitted_ghost_accessors_id;
			
 
				 
			
 
				+	/* protected by sequential_consistency_mutex */
			
 
				 	struct _starpu_task_wrapper_list *post_sync_tasks;
			
 
				 	unsigned post_sync_tasks_cnt;
			
 
				 
			
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -20,6 +20,13 @@
 
				 #include <common/utils.h>
			
 
				 #include <datawizard/datawizard.h>
			
 
				 
			
 
				+/* TODO: This should be tuned according to driver capabilities
			
 
				+ * Data interfaces should also have to declare how many asynchronous requests
			
 
				+ * they have actually started (think of e.g. csr).
			
 
				+ */
			
 
				+#define MAX_PENDING_REQUESTS_PER_NODE 400
			
 
				+#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 200
			
 
				+
			
 
				 /* requests that have not been treated at all */
			
 
				 static struct _starpu_data_request_list *data_requests[STARPU_MAXNODES];
			
 
				 static struct _starpu_data_request_list *prefetch_requests[STARPU_MAXNODES];
			
@@ -27,6 +34,7 @@ static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 
				 
			
 
				 /* requests that are not terminated (eg. async transfers) */
			
 
				 static struct _starpu_data_request_list *data_requests_pending[STARPU_MAXNODES];
			
 
				+static unsigned data_requests_npending[STARPU_MAXNODES];
			
 
				 static starpu_pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES];
			
 
				 
			
 
				 void _starpu_init_data_request_lists(void)
			
@@ -36,11 +44,19 @@ void _starpu_init_data_request_lists(void)
 
				 	{
			
 
				 		prefetch_requests[i] = _starpu_data_request_list_new();
			
 
				 		data_requests[i] = _starpu_data_request_list_new();
			
 
				+
			
 
				+		/* Tell helgrind that we are fine with checking for list_empty
			
 
				+		 * in _starpu_handle_node_data_requests, we will call it
			
 
				+		 * periodically anyway */
			
 
				+		STARPU_HG_DISABLE_CHECKING(data_requests[i]->_head);
			
 
				+
			
 
				 		STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i], NULL);
			
 
				 
			
 
				 		data_requests_pending[i] = _starpu_data_request_list_new();
			
 
				+		data_requests_npending[i] = 0;
			
 
				 		STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i], NULL);
			
 
				 	}
			
 
				+	STARPU_HG_DISABLE_CHECKING(data_requests_npending);
			
 
				 }
			
 
				 
			
 
				 void _starpu_deinit_data_request_lists(void)
			
@@ -380,6 +396,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
				 
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
			
 
				 		_starpu_data_request_list_push_front(data_requests_pending[r->handling_node], r);
			
 
				+		data_requests_npending[r->handling_node]++;
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
			
 
				 
			
 
				 		return -EAGAIN;
			
@@ -397,18 +414,11 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 
				 	struct _starpu_data_request *r;
			
 
				 	struct _starpu_data_request_list *new_data_requests;
			
 
				 
			
 
				-	/* Note: we here tell valgrind that list_empty (reading a pointer) is
			
 
				-	 * as safe as if we had the lock held */
			
 
				-	VALGRIND_HG_MUTEX_LOCK_PRE(&data_requests_list_mutex[src_node], 0);
			
 
				-	VALGRIND_HG_MUTEX_LOCK_POST(&data_requests_list_mutex[src_node]);
			
 
				+	/* Here helgrind would should that this is an un protected access.
			
 
				+	 * We however don't care about missing an entry, we will get called
			
 
				+	 * again sooner or later. */
			
 
				 	if (_starpu_data_request_list_empty(data_requests[src_node]))
			
 
				-	{
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
			
 
				 		return;
			
 
				-	}
			
 
				-	VALGRIND_HG_MUTEX_UNLOCK_PRE(&data_requests_list_mutex[src_node]);
			
 
				-	VALGRIND_HG_MUTEX_UNLOCK_POST(&data_requests_list_mutex[src_node]);
			
 
				 
			
 
				 	/* take all the entries from the request list */
			
 
				         STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
			
@@ -427,6 +437,7 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 
				 	 * requests, and we handle the request(s) one by one in the former
			
 
				 	 * list, without concurrency issues.*/
			
 
				 	data_requests[src_node] = _starpu_data_request_list_new();
			
 
				+	STARPU_HG_DISABLE_CHECKING(data_requests[src_node]->_head);
			
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
			
 
				 
			
@@ -437,15 +448,29 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 
				 	{
			
 
				                 int res;
			
 
				 
			
 
				+		if (data_requests_npending[src_node] >= MAX_PENDING_REQUESTS_PER_NODE)
			
 
				+		{
			
 
				+			/* Too many requests at the same time, skip pushing
			
 
				+			 * more for now */
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				 		r = _starpu_data_request_list_pop_front(local_list);
			
 
				 
			
 
				 		res = starpu_handle_data_request(r, may_alloc);
			
 
				 		if (res == -ENOMEM)
			
 
				 		{
			
 
				 			_starpu_data_request_list_push_back(new_data_requests, r);
			
 
				+			break;
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	while (!_starpu_data_request_list_empty(local_list))
			
 
				+	{
			
 
				+		r = _starpu_data_request_list_pop_front(local_list);
			
 
				+		_starpu_data_request_list_push_back(new_data_requests, r);
			
 
				+	}
			
 
				+
			
 
				 	if (!_starpu_data_request_list_empty(new_data_requests))
			
 
				 	{
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
			
@@ -493,6 +518,13 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 
				 	{
			
 
				                 int res;
			
 
				 
			
 
				+		if (data_requests_npending[src_node] >= MAX_PENDING_PREFETCH_REQUESTS_PER_NODE)
			
 
				+		{
			
 
				+			/* Too many requests at the same time, skip pushing
			
 
				+			 * more for now */
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				 		r = _starpu_data_request_list_pop_front(local_list);
			
 
				 
			
 
				 		res = starpu_handle_data_request(r, may_alloc);
			
@@ -539,6 +571,7 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 
				 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
			
 
				 //
			
 
				 	struct _starpu_data_request_list *new_data_requests_pending;
			
 
				+	unsigned taken, kept;
			
 
				 
			
 
				 	if (_starpu_data_request_list_empty(data_requests_pending[src_node]))
			
 
				 		return;
			
@@ -558,11 +591,14 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
			
 
				 
			
 
				 	new_data_requests_pending = _starpu_data_request_list_new();
			
 
				+	taken = 0;
			
 
				+	kept = 0;
			
 
				 
			
 
				 	while (!_starpu_data_request_list_empty(local_list))
			
 
				 	{
			
 
				 		struct _starpu_data_request *r;
			
 
				 		r = _starpu_data_request_list_pop_front(local_list);
			
 
				+		taken++;
			
 
				 
			
 
				 		starpu_data_handle_t handle = r->handle;
			
 
				 
			
@@ -592,15 +628,15 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 
				 				_starpu_spin_unlock(&handle->header_lock);
			
 
				 
			
 
				 				_starpu_data_request_list_push_back(new_data_requests_pending, r);
			
 
				+				kept++;
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				-	if (!_starpu_data_request_list_empty(new_data_requests_pending))
			
 
				-	{
			
 
				-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
			
 
				+	data_requests_npending[src_node] -= taken - kept;
			
 
				+	if (kept)
			
 
				 		_starpu_data_request_list_push_list_back(data_requests_pending[src_node], new_data_requests_pending);
			
 
				-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
			
 
				-	}
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
			
 
				 
			
 
				 	_starpu_data_request_list_delete(local_list);
			
 
				 	_starpu_data_request_list_delete(new_data_requests_pending);
			
@@ -626,7 +662,7 @@ int _starpu_check_that_no_data_request_exists(unsigned node)
 
				 	no_request = _starpu_data_request_list_empty(data_requests[node]);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[node]);
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[node]);
			
 
				-	no_pending = _starpu_data_request_list_empty(data_requests_pending[node]);
			
 
				+	no_pending = !data_requests_npending[node];
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[node]);
			
 
				 
			
 
				 	return (no_request && no_pending);
			
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -191,6 +191,8 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 
				 		child->last_sync_task = NULL;
			
 
				 		child->last_submitted_accessors = NULL;
			
 
				 		child->post_sync_tasks = NULL;
			
 
				+		/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
			
 
				+		STARPU_HG_DISABLE_CHECKING(child->post_sync_tasks_cnt);
			
 
				 		child->post_sync_tasks_cnt = 0;
			
 
				 
			
 
				 		/* The methods used for reduction are propagated to the
			
@@ -274,6 +276,8 @@ void _starpu_empty_codelet_function(void *buffers[], void *args)
 
				 void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gathering_node)
			
 
				 {
			
 
				 	unsigned child;
			
 
				+	unsigned worker;
			
 
				+	unsigned nworkers = starpu_worker_get_count();
			
 
				 	unsigned node;
			
 
				 	unsigned sizes[root_handle->nchildren];
			
 
				 
			
@@ -315,16 +319,34 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 
				 		}
			
 
				 
			
 
				 		int ret;
			
 
				-		ret = _starpu_fetch_data_on_node(child_handle, &child_handle->per_node[gathering_node], STARPU_R, 0, 0, NULL, NULL);
			
 
				 		/* for now we pretend that the RAM is almost unlimited and that gathering
			
 
				 		 * data should be possible from the node that does the unpartionning ... we
			
 
				 		 * don't want to have the programming deal with memory shortage at that time,
			
 
				 		 * really */
			
 
				+		if (child_handle->current_mode == STARPU_REDUX)
			
 
				+		{
			
 
				+			/* Acquire the child data on the gathering node. This will trigger collapsing the reduction */
			
 
				+			ret = starpu_data_acquire_on_node(child_handle, gathering_node, STARPU_RW);
			
 
				+			_starpu_unlock_post_sync_tasks(child_handle);
			
 
				+		} else
			
 
				+		{
			
 
				+			/* Simply transfer any pending data */
			
 
				+			ret = _starpu_fetch_data_on_node(child_handle, &child_handle->per_node[gathering_node], STARPU_R, 0, 0, NULL, NULL);
			
 
				+		}
			
 
				 		STARPU_ASSERT(ret == 0);
			
 
				 
			
 
				 		_starpu_spin_lock(&child_handle->header_lock);
			
 
				 
			
 
				 		_starpu_data_free_interfaces(child_handle);
			
 
				+
			
 
				+		for (worker = 0; worker < nworkers; worker++)
			
 
				+		{
			
 
				+			struct _starpu_data_replicate *local = &child_handle->per_worker[worker];
			
 
				+			STARPU_ASSERT(local->state == STARPU_INVALID);
			
 
				+			if (local->allocated && local->automatically_allocated)
			
 
				+				_starpu_request_mem_chunk_removal(child_handle, local, starpu_worker_get_memory_node(worker), sizes[child]);
			
 
				+		}
			
 
				+
			
 
				 		_starpu_memory_stats_free(child_handle);
			
 
				 		_starpu_data_requester_list_delete(child_handle->req_list);
			
 
				 		_starpu_data_requester_list_delete(child_handle->reduction_req_list);
			
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -200,6 +200,9 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
				 	handle->last_sync_task = NULL;
			
 
				 	handle->last_submitted_accessors = NULL;
			
 
				 	handle->post_sync_tasks = NULL;
			
 
				+
			
 
				+	/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
			
 
				+	STARPU_HG_DISABLE_CHECKING(handle->post_sync_tasks_cnt);
			
 
				 	handle->post_sync_tasks_cnt = 0;
			
 
				 
			
 
				 	/* By default, there are no methods available to perform a reduction */
			
@@ -292,6 +295,10 @@ int _starpu_data_handle_init(starpu_data_handle_t handle, struct starpu_data_int
 
				 	unsigned node;
			
 
				 	unsigned worker;
			
 
				 
			
 
				+	/* Tell helgrind that our access to busy_count in
			
 
				+	 * starpu_data_unregister is actually safe */
			
 
				+	STARPU_HG_DISABLE_CHECKING(handle->busy_count);
			
 
				+
			
 
				 	handle->ops = interface_ops;
			
 
				 	handle->mf_node = mf_node;
			
 
				 
			
@@ -421,7 +428,8 @@ int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 
				 	entry = (struct handle_tag_entry *) malloc(sizeof(*entry));
			
 
				 	STARPU_ASSERT(entry != NULL);
			
 
				 
			
 
				-	STARPU_ASSERT_MSG(!(starpu_data_get_data_handle_from_tag(tag)),"data handle %p already has tag %d\n", starpu_data_get_data_handle_from_tag(tag), tag);
			
 
				+	STARPU_ASSERT_MSG(!(starpu_data_get_data_handle_from_tag(tag)),
			
 
				+			  "There is already a data handle %p registered with the tag %d\n", starpu_data_get_data_handle_from_tag(tag), tag);
			
 
				 
			
 
				 	entry->tag = tag;
			
 
				 	entry->handle = handle;
			
@@ -677,14 +685,11 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
			
 
				 	while (1) {
			
 
				 		int busy;
			
 
				-		/* Note: we here tell valgrind that reading busy_count is as
			
 
				-		 * safe is if we had the lock held */
			
 
				-		_STARPU_VALGRIND_HG_SPIN_LOCK_PRE(&handle->header_lock);
			
 
				-		_STARPU_VALGRIND_HG_SPIN_LOCK_POST(&handle->header_lock);
			
 
				-		busy = handle->busy_count;
			
 
				-		_STARPU_VALGRIND_HG_SPIN_UNLOCK_PRE(&handle->header_lock);
			
 
				-		_STARPU_VALGRIND_HG_SPIN_UNLOCK_POST(&handle->header_lock);
			
 
				-		if (!busy)
			
 
				+		/* Here helgrind would shout that this an unprotected access,
			
 
				+		 * but this is actually fine: all threads who do busy_count--
			
 
				+		 * are supposed to call _starpu_data_check_not_busy, which will
			
 
				+		 * wake us up through the busy_mutex/busy_cond. */
			
 
				+		if (!handle->busy_count)
			
 
				 			break;
			
 
				 		/* This is woken by _starpu_data_check_not_busy, always called
			
 
				 		 * after decrementing busy_count */
			
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -230,11 +230,7 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 
				         _STARPU_LOG_IN();
			
 
				 
			
 
				 	/* unless asynchronous, it is forbidden to call this function from a callback or a codelet */
			
 
				-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
			
 
				-	{
			
 
				-                _STARPU_LOG_OUT_TAG("EDEADLK");
			
 
				-		return -EDEADLK;
			
 
				-        }
			
 
				+	STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "Acquiring a data synchronously is not possible from a codelet or from a task callback, use starpu_data_acquire_cb instead.");
			
 
				 
			
 
				 	if (_starpu_data_is_multiformat_handle(handle) &&
			
 
				 	    _starpu_handle_needs_conversion_task(handle, 0))
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -238,7 +238,7 @@ int _starpu_cpu_driver_init(struct starpu_driver *d)
 
				 
			
 
				 	int devid = cpu_worker->devid;
			
 
				 
			
 
				-	_starpu_worker_init(cpu_worker, _STARPU_FUT_CPU_KEY);
			
 
				+	_starpu_worker_start(cpu_worker, _STARPU_FUT_CPU_KEY);
			
 
				 	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
			
 
				 	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->memory_node, cpu_worker->config));
			
 
				 
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -391,7 +391,7 @@ int _starpu_cuda_driver_init(struct starpu_driver *d)
 
				 	STARPU_ASSERT(args);
			
 
				 	unsigned devid = args->devid;
			
 
				 
			
 
				-	_starpu_worker_init(args, _STARPU_FUT_CUDA_KEY);
			
 
				+	_starpu_worker_start(args, _STARPU_FUT_CUDA_KEY);
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	init_context(devid);
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -241,38 +241,11 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
				 			}
			
 
				 		}
			
 
				 
			
 
				-#ifdef STARPU_USE_SC_HYPERVISOR
			
 
				-		struct _starpu_sched_ctx *sched_ctx = NULL;
			
 
				-		struct starpu_sched_ctx_performance_counters *perf_counters = NULL;
			
 
				-		struct _starpu_sched_ctx_list *l = NULL;
			
 
				-		for (l = args->sched_ctx_list; l; l = l->next)
			
 
				-		{
			
 
				-			sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
			
 
				-			if(sched_ctx->id != 0)
			
 
				-			{
			
 
				-				perf_counters = sched_ctx->perf_counters;
			
 
				-				if(perf_counters != NULL && perf_counters->notify_idle_cycle)
			
 
				-				{
			
 
				-					perf_counters->notify_idle_cycle(sched_ctx->id, args->workerid, 1.0);
			
 
				-					
			
 
				-				}
			
 
				-			}
			
 
				-		}
			
 
				-#endif //STARPU_USE_SC_HYPERVISOR
			
 
				-
			
 
				 		return NULL;
			
 
				 	}
			
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
			
 
				 
			
 
				-#ifdef STARPU_USE_SC_HYPERVISOR
			
 
				-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
			
 
				-	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
			
 
				-
			
 
				-	if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_end)
			
 
				-		perf_counters->notify_idle_end(task->sched_ctx, args->workerid);
			
 
				-#endif //STARPU_USE_SC_HYPERVISOR
			
 
				-
			
 
				 	_starpu_worker_set_status_wakeup(workerid);
			
 
				 	args->spinning_backoff = BACKOFF_MIN;
			
 
				 
			
--- a/src/drivers/mic/driver_mic_source.c
+++ b/src/drivers/mic/driver_mic_source.c
@@ -517,7 +517,7 @@ void *_starpu_mic_src_worker(void *arg)
 
				 
			
 
				 	/* unsigned memnode = baseworker->memory_node; */
			
 
				 
			
 
				-	_starpu_worker_init(baseworker, _STARPU_FUT_MIC_KEY);
			
 
				+	_starpu_worker_start(baseworker, _STARPU_FUT_MIC_KEY);
			
 
				 
			
 
				 	// Current task for a thread managing a worker set has no sense.
			
 
				 	_starpu_set_current_task(NULL);
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -599,7 +599,7 @@ int _starpu_opencl_driver_init(struct starpu_driver *d)
 
				 	STARPU_ASSERT(args);
			
 
				 	int devid = args->devid;
			
 
				 
			
 
				-	_starpu_worker_init(args, _STARPU_FUT_OPENCL_KEY);
			
 
				+	_starpu_worker_start(args, _STARPU_FUT_OPENCL_KEY);
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	_starpu_opencl_init_context(devid);
			
--- a/src/drivers/scc/driver_scc_source.c
+++ b/src/drivers/scc/driver_scc_source.c
@@ -291,7 +291,7 @@ void *_starpu_scc_src_worker(void *arg)
 
				 	unsigned mp_nodeid = args->mp_nodeid;
			
 
				 	unsigned i;
			
 
				 
			
 
				-	_starpu_worker_init(args, _STARPU_FUT_SCC_KEY);
			
 
				+	_starpu_worker_start(args, _STARPU_FUT_SCC_KEY);
			
 
				 
			
 
				 	_starpu_scc_src_init_context(devid);
			
 
				 
			
--- a/src/sched_policies/eager_central_policy.c
+++ b/src/sched_policies/eager_central_policy.c
@@ -42,6 +42,11 @@ static void initialize_eager_center_policy(unsigned sched_ctx_id)
 
				 	/* there is only a single queue in that trivial design */
			
 
				 	data->fifo =  _starpu_create_fifo();
			
 
				 
			
 
				+	 /* Tell helgrind that it's fine to check for empty fifo in
			
 
				+	  * pop_task_eager_policy without actual mutex (it's just an integer)
			
 
				+	  */
			
 
				+	STARPU_HG_DISABLE_CHECKING(data->fifo->ntasks);
			
 
				+
			
 
				 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)data);
			
 
				 	STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL);
			
 
				 }
			
@@ -117,19 +122,12 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
				 
			
 
				 	struct starpu_task *task = NULL;
			
 
				 
			
 
				-	/* Tell helgrind that it's fine to check for empty fifo without actual
			
 
				-	 * mutex (it's just a pointer) */
			
 
				-	VALGRIND_HG_MUTEX_LOCK_PRE(&data->policy_mutex, 0);
			
 
				-	VALGRIND_HG_MUTEX_LOCK_POST(&data->policy_mutex);
			
 
				 	/* block until some event happens */
			
 
				+	/* Here helgrind would shout that this is unprotected, this is just an
			
 
				+	 * integer access, and we hold the sched mutex, so we can not miss any
			
 
				+	 * wake up. */
			
 
				 	if (_starpu_fifo_empty(data->fifo))
			
 
				-	{
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_PRE(&data->policy_mutex);
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_POST(&data->policy_mutex);
			
 
				 		return NULL;
			
 
				-	}
			
 
				-	VALGRIND_HG_MUTEX_UNLOCK_PRE(&data->policy_mutex);
			
 
				-	VALGRIND_HG_MUTEX_UNLOCK_POST(&data->policy_mutex);
			
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
			
 
				 	task = _starpu_fifo_pop_task(data->fifo, workerid);
			
@@ -147,6 +145,12 @@ static void eager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nw
 
				 	{
			
 
				 		workerid = workerids[i];
			
 
				 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
			
 
				+
			
 
				+		starpu_pthread_mutex_t *sched_mutex;
			
 
				+		starpu_pthread_cond_t *sched_cond;
			
 
				+		starpu_worker_get_sched_condition(workerid, &sched_mutex, &sched_cond);
			
 
				+
			
 
				+		starpu_wakeup_worker(workerid, sched_cond, sched_mutex);
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/src/sched_policies/eager_central_priority_policy.c
+++ b/src/sched_policies/eager_central_priority_policy.c
@@ -84,6 +84,11 @@ static void initialize_eager_center_priority_policy(unsigned sched_ctx_id)
 
				 
			
 
				 	/* only a single queue (even though there are several internaly) */
			
 
				 	data->taskq = _starpu_create_priority_taskq();
			
 
				+
			
 
				+	/* Tell helgrind that it's fine to check for empty fifo in
			
 
				+	 * _starpu_priority_pop_task without actual mutex (it's just an
			
 
				+	 * integer) */
			
 
				+	STARPU_HG_DISABLE_CHECKING(data->taskq->total_ntasks);
			
 
				 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)data);
			
 
				 	STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL);
			
 
				 
			
@@ -154,19 +159,12 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 
				 
			
 
				 	struct _starpu_priority_taskq *taskq = data->taskq;
			
 
				 
			
 
				-	/* Tell helgrind that it's fine to check for empty fifo without actual
			
 
				-	 * mutex (it's just a pointer) */
			
 
				-	VALGRIND_HG_MUTEX_LOCK_PRE(&data->policy_mutex, 0);
			
 
				-	VALGRIND_HG_MUTEX_LOCK_POST(&data->policy_mutex);
			
 
				 	/* block until some event happens */
			
 
				+	/* Here helgrind would shout that this is unprotected, this is just an
			
 
				+	 * integer access, and we hold the sched mutex, so we can not miss any
			
 
				+	 * wake up. */
			
 
				 	if (taskq->total_ntasks == 0)
			
 
				-	{
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_PRE(&data->policy_mutex);
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_POST(&data->policy_mutex);
			
 
				 		return NULL;
			
 
				-	}
			
 
				-	VALGRIND_HG_MUTEX_UNLOCK_PRE(&data->policy_mutex);
			
 
				-	VALGRIND_HG_MUTEX_UNLOCK_POST(&data->policy_mutex);
			
 
				 
			
 
				 	/* release this mutex before trying to wake up other workers */
			
 
				 	starpu_pthread_mutex_t *curr_sched_mutex;
			
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -112,14 +112,14 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 
			
 
				 	if (!starpu_worker_is_combined_worker(best_workerid))
			
 
				 	{
			
 
				-		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
			
 
				-		/* TODO */
			
 
				-		task->predicted_transfer = 0;
			
 
				 		starpu_pthread_mutex_t *sched_mutex;
			
 
				 		starpu_pthread_cond_t *sched_cond;
			
 
				 		starpu_worker_get_sched_condition(best_workerid, &sched_mutex, &sched_cond);
			
 
				 
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				+		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
			
 
				+		/* TODO */
			
 
				+		task->predicted_transfer = 0;
			
 
				 		worker_exp_len[best_workerid] += task->predicted;
			
 
				 		worker_exp_end[best_workerid] = exp_end_predicted;
			
 
				 		worker_exp_start[best_workerid] = exp_end_predicted - worker_exp_len[best_workerid];
			
@@ -196,11 +196,10 @@ static double compute_expected_end(int workerid, double length)
 
				 		double res;
			
 
				 		/* This is a basic worker */
			
 
				 
			
 
				-		VALGRIND_HG_MUTEX_LOCK_PRE(sched_mutex, 0);
			
 
				-		VALGRIND_HG_MUTEX_LOCK_POST(sched_mutex);
			
 
				+		/* Here helgrind would shout that this is unprotected, but we
			
 
				+		 * are fine with getting outdated values, this is just an
			
 
				+		 * estimation */
			
 
				 		res = worker_exp_start[workerid] + worker_exp_len[workerid] + length;
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_PRE(sched_mutex);
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_POST(sched_mutex);
			
 
				 
			
 
				 		return res;
			
 
				 	}
			
@@ -213,9 +212,9 @@ static double compute_expected_end(int workerid, double length)
 
				 
			
 
				 		double exp_end = DBL_MIN;
			
 
				 
			
 
				-		VALGRIND_HG_MUTEX_LOCK_PRE(sched_mutex, 0);
			
 
				-		VALGRIND_HG_MUTEX_LOCK_POST(sched_mutex);
			
 
				-
			
 
				+		/* Here helgrind would shout that this is unprotected, but we
			
 
				+		 * are fine with getting outdated values, this is just an
			
 
				+		 * estimation */
			
 
				 		int i;
			
 
				 		for (i = 0; i < worker_size; i++)
			
 
				 		{
			
@@ -225,9 +224,6 @@ static double compute_expected_end(int workerid, double length)
 
				 			exp_end = STARPU_MAX(exp_end, local_exp_end);
			
 
				 		}
			
 
				 
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_PRE(sched_mutex);
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_POST(sched_mutex);
			
 
				-
			
 
				 		return exp_end;
			
 
				 	}
			
 
				 }
			
@@ -245,11 +241,10 @@ static double compute_ntasks_end(int workerid)
 
				 		double res;
			
 
				 		/* This is a basic worker */
			
 
				 
			
 
				-		VALGRIND_HG_MUTEX_LOCK_PRE(sched_mutex, 0);
			
 
				-		VALGRIND_HG_MUTEX_LOCK_POST(sched_mutex);
			
 
				+		/* Here helgrind would shout that this is unprotected, but we
			
 
				+		 * are fine with getting outdated values, this is just an
			
 
				+		 * estimation */
			
 
				 		res = ntasks[workerid] / starpu_worker_get_relative_speedup(perf_arch);
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_PRE(sched_mutex);
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_POST(sched_mutex);
			
 
				 
			
 
				 		return res;
			
 
				 	}
			
@@ -262,9 +257,9 @@ static double compute_ntasks_end(int workerid)
 
				 
			
 
				 		int ntasks_end=0;
			
 
				 
			
 
				-		VALGRIND_HG_MUTEX_LOCK_PRE(sched_mutex, 0);
			
 
				-		VALGRIND_HG_MUTEX_LOCK_POST(sched_mutex);
			
 
				-
			
 
				+		/* Here helgrind would shout that this is unprotected, but we
			
 
				+		 * are fine with getting outdated values, this is just an
			
 
				+		 * estimation */
			
 
				 		int i;
			
 
				 		for (i = 0; i < worker_size; i++)
			
 
				 		{
			
@@ -272,9 +267,6 @@ static double compute_ntasks_end(int workerid)
 
				 			ntasks_end = STARPU_MAX(ntasks_end, (int) ((double) ntasks[combined_workerid[i]] / starpu_worker_get_relative_speedup(perf_arch)));
			
 
				 		}
			
 
				 
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_PRE(sched_mutex);
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_POST(sched_mutex);
			
 
				-
			
 
				 		return ntasks_end;
			
 
				 	}
			
 
				 }
			
@@ -580,6 +572,11 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_INIT(&hd->global_push_mutex, NULL);
			
 
				 
			
 
				+	/* Tell helgrind that we are fine with getting outdated values when
			
 
				+	 * estimating schedules */
			
 
				+	STARPU_HG_DISABLE_CHECKING(worker_exp_start);
			
 
				+	STARPU_HG_DISABLE_CHECKING(worker_exp_len);
			
 
				+	STARPU_HG_DISABLE_CHECKING(ntasks);
			
 
				 }
			
 
				 
			
 
				 static void parallel_heft_deinit(unsigned sched_ctx_id)
			
--- a/src/sched_policies/work_stealing_policy.c
+++ b/src/sched_policies/work_stealing_policy.c
@@ -73,11 +73,10 @@ static unsigned select_victim_round_robin(unsigned sched_ctx_id)
 
				 		unsigned njobs;
			
 
				 
			
 
				 		starpu_worker_get_sched_condition(worker, &victim_sched_mutex, &victim_sched_cond);
			
 
				-		VALGRIND_HG_MUTEX_LOCK_PRE(victim_sched_mutex, 0);
			
 
				-		VALGRIND_HG_MUTEX_LOCK_POST(victim_sched_mutex);
			
 
				+		/* Here helgrind would shout that this is unprotected, but we
			
 
				+		 * are fine with getting outdated values, this is just an
			
 
				+		 * estimation */
			
 
				 		njobs = ws->queue_array[worker]->njobs;
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_PRE(victim_sched_mutex);
			
 
				-		VALGRIND_HG_MUTEX_UNLOCK_POST(victim_sched_mutex);
			
 
				 
			
 
				 		if (njobs)
			
 
				 			break;
			
@@ -402,6 +401,11 @@ static void ws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworke
 
				 		workerid = workerids[i];
			
 
				 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
			
 
				 		ws->queue_array[workerid] = _starpu_create_deque();
			
 
				+
			
 
				+		/* Tell helgrid that we are fine with getting outdated values,
			
 
				+		 * this is just an estimation */
			
 
				+		STARPU_HG_DISABLE_CHECKING(ws->queue_array[workerid]->njobs);
			
 
				+
			
 
				 		/**
			
 
				 		 * The first WS_POP_TASK will increase NPROCESSED though no task was actually performed yet,
			
 
				 		 * we need to initialize it at -1.
			
--- a/src/util/starpu_insert_task.c
+++ b/src/util/starpu_insert_task.c
@@ -21,7 +21,7 @@
 
				 #include <starpu.h>
			
 
				 #include <common/config.h>
			
 
				 #include <stdarg.h>
			
 
				-#include <util/starpu_insert_task_utils.h>
			
 
				+#include <util/starpu_task_insert_utils.h>
			
 
				 
			
 
				 void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...)
			
 
				 {
			
@@ -29,7 +29,7 @@ void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...)
 
				 
			
 
				 	/* Compute the size */
			
 
				 	va_start(varg_list, arg_buffer_size);
			
 
				-	*arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
			
 
				+	*arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list);
			
 
				 
			
 
				 	va_start(varg_list, arg_buffer_size);
			
 
				 	_starpu_codelet_pack_args(arg_buffer, *arg_buffer_size, varg_list);
			
@@ -62,7 +62,7 @@ void starpu_codelet_unpack_args(void *_cl_arg, ...)
 
				 	va_end(varg_list);
			
 
				 }
			
 
				 
			
 
				-int starpu_insert_task(struct starpu_codelet *cl, ...)
			
 
				+int starpu_task_insert(struct starpu_codelet *cl, ...)
			
 
				 {
			
 
				 	va_list varg_list;
			
 
				 	void *arg_buffer = NULL;
			
@@ -70,7 +70,7 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 
				 	/* Compute the size */
			
 
				 	size_t arg_buffer_size = 0;
			
 
				 	va_start(varg_list, cl);
			
 
				-	arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
			
 
				+	arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list);
			
 
				 
			
 
				 	if (arg_buffer_size)
			
 
				 	{
			
@@ -87,7 +87,7 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 
				 	}
			
 
				 
			
 
				 	va_start(varg_list, cl);
			
 
				-	int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, cl, &task, varg_list);
			
 
				+	int ret = _starpu_task_insert_create_and_submit(arg_buffer, arg_buffer_size, cl, &task, varg_list);
			
 
				 
			
 
				 	if (ret == -ENODEV)
			
 
				 	{
			
@@ -96,3 +96,31 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 
				 	}
			
 
				 	return ret;
			
 
				 }
			
 
				+
			
 
				+struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...)
			
 
				+{
			
 
				+	va_list varg_list;
			
 
				+	void *arg_buffer = NULL;
			
 
				+
			
 
				+	/* Compute the size */
			
 
				+	size_t arg_buffer_size = 0;
			
 
				+	va_start(varg_list, cl);
			
 
				+	arg_buffer_size = _starpu_task_insert_get_arg_size(varg_list);
			
 
				+
			
 
				+	if (arg_buffer_size)
			
 
				+	{
			
 
				+		va_start(varg_list, cl);
			
 
				+		_starpu_codelet_pack_args(&arg_buffer, arg_buffer_size, varg_list);
			
 
				+	}
			
 
				+
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+	if (cl && cl->nbuffers > STARPU_NMAXBUFS)
			
 
				+	{
			
 
				+		task->dyn_handles = malloc(cl->nbuffers * sizeof(starpu_data_handle_t));
			
 
				+	}
			
 
				+
			
 
				+	va_start(varg_list, cl);
			
 
				+	_starpu_task_insert_create(arg_buffer, arg_buffer_size, cl, &task, varg_list);
			
 
				+	return task;
			
 
				+}
			
--- a/src/util/starpu_insert_task_utils.c
+++ b/src/util/starpu_insert_task_utils.c