Explorar o código

merge from trunk

Corentin Salingue hai 8 anos
pai
achega
f9c20a61bb
Modificáronse 100 ficheiros con 982 adicións e 705 borrados
  1. 37 4
      ChangeLog
  2. 5 16
      configure.ac
  3. 2 2
      doc/doxygen/chapters/370_online_performance_tools.doxy
  4. 15 8
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  5. 8 0
      doc/doxygen/chapters/390_faq.doxy
  6. 16 2
      doc/doxygen/chapters/410_mpi_support.doxy
  7. 5 5
      doc/doxygen/chapters/470_simgrid.doxy
  8. 22 4
      doc/doxygen/chapters/501_environment_variables.doxy
  9. 3 10
      doc/doxygen/chapters/510_configure_options.doxy
  10. 17 0
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  11. 52 7
      doc/doxygen/chapters/api/data_management.doxy
  12. 0 2
      doc/doxygen/chapters/api/modularized_scheduler.doxy
  13. 5 0
      doc/doxygen/chapters/api/scheduling_policy.doxy
  14. 3 9
      examples/Makefile.am
  15. 7 1
      examples/basic_examples/multiformat.c
  16. 4 1
      examples/cg/cg.c
  17. 0 156
      examples/cg/cg_dot_kernel.cu
  18. 13 25
      examples/cg/cg_kernels.c
  19. 3 1
      examples/cholesky/cholesky_grain_tag.c
  20. 13 3
      examples/cholesky/cholesky_implicit.c
  21. 24 21
      examples/cholesky/cholesky_kernels.c
  22. 3 1
      examples/cholesky/cholesky_tag.c
  23. 4 1
      examples/cholesky/cholesky_tile_tag.c
  24. 7 1
      examples/cpp/add_vectors.cpp
  25. 7 1
      examples/cpp/add_vectors_cpp11.cpp
  26. 7 1
      examples/cpp/incrementer_cpp.cpp
  27. 10 6
      examples/heat/dw_factolu_kernels.c
  28. 19 18
      examples/heat/dw_sparse_cg_kernels.c
  29. 3 1
      examples/lu/xlu.c
  30. 4 1
      examples/lu/xlu_implicit.c
  31. 4 1
      examples/lu/xlu_implicit_pivot.c
  32. 4 2
      examples/lu/xlu_pivot.c
  33. 4 1
      examples/mandelbrot/mandelbrot.c
  34. 13 9
      examples/mlr/mlr.c
  35. 1 0
      examples/native_fortran/nf_sched_ctx.f90
  36. 5 3
      examples/ppm_downscaler/yuv_downscaler.c
  37. 3 2
      examples/sched_ctx/axpy_partition_gpu.h
  38. 1 0
      examples/sched_ctx/parallel_code.c
  39. 5 3
      examples/scheduler/heteroprio_test.c
  40. 4 2
      examples/spmv/dw_block_spmv_kernels.c
  41. 2 2
      examples/stencil/Makefile.am
  42. 10 8
      examples/stencil/implicit-stencil-blocks.c
  43. 10 8
      examples/stencil/stencil-blocks.c
  44. 5 2
      examples/stencil/stencil-tasks.c
  45. 3 1
      examples/stencil/stencil.c
  46. 2 1
      gcc-plugin/src/starpu.c
  47. 3 2
      include/pthread_win32/pthread.h
  48. 9 1
      include/starpu_data.h
  49. 4 1
      include/starpu_fxt.h
  50. 2 1
      include/starpu_sched_component.h
  51. 2 4
      include/starpu_scheduler.h
  52. 10 4
      include/starpu_task.h
  53. 2 2
      include/starpu_thread_util.h
  54. 12 2
      mpi/examples/comm/comm.c
  55. 11 2
      mpi/examples/comm/mix_comm.c
  56. 12 10
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  57. 41 19
      mpi/examples/matrix_mult/mm.c
  58. 29 15
      mpi/examples/mpi_lu/plu_example.c
  59. 23 12
      mpi/examples/mpi_lu/plu_implicit_example.c
  60. 35 17
      mpi/examples/mpi_lu/plu_outofcore_example.c
  61. 5 3
      mpi/examples/mpi_lu/plu_solve.c
  62. 95 64
      mpi/examples/mpi_lu/pxlu.c
  63. 11 3
      mpi/examples/mpi_lu/pxlu_implicit.c
  64. 33 21
      mpi/examples/mpi_lu/pxlu_kernels.c
  65. 8 1
      mpi/examples/stencil/stencil5.c
  66. 5 1
      mpi/examples/stencil/stencil5_lb.c
  67. 2 1
      mpi/include/starpu_mpi.h
  68. 2 1
      mpi/include/starpu_mpi_lb.h
  69. 3 2
      mpi/src/load_balancer/policy/data_movements_interface.c
  70. 2 1
      mpi/src/load_balancer/policy/load_balancer_policy.h
  71. 8 10
      mpi/src/load_balancer/policy/load_heat_propagation.c
  72. 53 26
      mpi/src/starpu_mpi.c
  73. 42 44
      mpi/src/starpu_mpi_cache.c
  74. 2 1
      mpi/src/starpu_mpi_cache.h
  75. 2 1
      mpi/src/starpu_mpi_cache_stats.h
  76. 1 1
      mpi/src/starpu_mpi_comm.c
  77. 3 2
      mpi/src/starpu_mpi_comm.h
  78. 3 2
      mpi/src/starpu_mpi_datatype.h
  79. 3 2
      mpi/src/starpu_mpi_early_data.h
  80. 3 2
      mpi/src/starpu_mpi_early_request.h
  81. 6 3
      mpi/src/starpu_mpi_fortran.c
  82. 17 6
      mpi/src/starpu_mpi_fxt.h
  83. 3 2
      mpi/src/starpu_mpi_init.h
  84. 2 2
      mpi/src/starpu_mpi_private.c
  85. 24 22
      mpi/src/starpu_mpi_private.h
  86. 3 2
      mpi/src/starpu_mpi_select_node.h
  87. 2 1
      mpi/src/starpu_mpi_stats.h
  88. 3 2
      mpi/src/starpu_mpi_sync_data.h
  89. 3 2
      mpi/src/starpu_mpi_tag.h
  90. 9 9
      mpi/src/starpu_mpi_task_insert.c
  91. 3 2
      mpi/src/starpu_mpi_task_insert.h
  92. 2 2
      mpi/tests/block_interface.c
  93. 2 2
      mpi/tests/block_interface_pinned.c
  94. 2 2
      mpi/tests/datatypes.c
  95. 3 2
      mpi/tests/early_request.c
  96. 2 2
      mpi/tests/gather.c
  97. 2 2
      mpi/tests/gather2.c
  98. 17 4
      mpi/tests/helper.h
  99. 2 2
      mpi/tests/insert_task_compute.c
  100. 0 0
      mpi/tests/insert_task_count.c

+ 37 - 4
ChangeLog

@@ -31,9 +31,12 @@ New features:
 Small features:
   * Scheduling contexts may now be associated a user data pointer at creation
     time, that can later be recalled through starpu_sched_ctx_get_user_data().
-  * Add STARPU_SIMGRID_TASK_SUBMIT_COST to simulate the cost of task submission
-    in simgrid mode. This provides more accurate simgrid predictions, especially
-    for the beginning of the execution.
+  * Add STARPU_SIMGRID_TASK_SUBMIT_COST and STARPU_SIMGRID_FETCHING_INPUT_COST
+    to simulate the cost of task submission and data fetching in simgrid mode.
+    This provides more accurate simgrid predictions, especially for the
+    beginning of the execution and regarding data transfers.
+  * STARPU_SIMGRID_SCHED_COST to take into account the time to perform scheduling
+    when running in SimGrid mode.
   * New configure option --enable-mpi-pedantic-isend (disabled by
     default) to acquire data in STARPU_RW (instead of STARPU_R) before
     performing MPI_Isend call
@@ -47,8 +50,21 @@ Changes:
 Small changes:
   * Use asynchronous transfers for task data fetches with were not prefetched.
 
-StarPU 1.2.1 (svn revision xxx)
+StarPU 1.2.2 (svn revision xxx)
 ==============================================
+
+New features:
+  * Add starpu_data_acquire_try and starpu_data_acquire_on_node_try.
+  * Add NVCC_CC environment variable.
+  * Add -no-foo options to starpu_fxt_tool to make traces lighter
+
+Small changes:
+  * Output generated through STARPU_MPI_COMM has been modified to
+    allow easier automated checking
+
+StarPU 1.2.1 (svn revision 20299)
+==============================================
+
 New features:
   * Add starpu_fxt_trace_user_event_string.
   * Add starpu_tasks_rec_complete tool to add estimation times in tasks.rec
@@ -61,10 +77,27 @@ New features:
     models between devices, making calibration much faster.
   * Add modular-heft-prio scheduler.
   * Add starpu_cublas_get_local_handle helper.
+  * Add starpu_data_set_name, starpu_data_set_coordinates_array, and
+    starpu_data_set_coordinates to describe data, and starpu_iteration_push and
+    starpu_iteration_pop to describe tasks, for better offline traces analysis.
+  * New function starpu_bus_print_filenames() to display filenames
+    storing bandwidth/affinity/latency information, available through
+    tools/starpu_machine_display -i
+  * Add support for Ayudame version 2.x debugging library.
+  * Add starpu_sched_ctx_get_workers_list_raw, much less costly than
+    starpu_sched_ctx_get_workers_list
+  * Add starpu_task_get_name and use it to warn about dmda etc. using
+    a dumb policy when calibration is not finished
+  * MPI: Add functions to test for cached values
 
 Changes:
   * Fix performance regression of lws for small tasks.
+  * Improve native Fortran support for StarPU
 
+Small changes:
+  * Fix type of data home node to allow users to pass -1 to define
+    temporary data
+  * Fix compatibility with simgrid 3.14
 
 StarPU 1.2.0 (svn revision 18521)
 ==============================================

+ 5 - 16
configure.ac

@@ -631,6 +631,8 @@ then
     INCLUDE_PTHREAD_H='#include <pthread.h>'
 fi
 
+AC_CHECK_HEADERS([unistd.h], [AC_DEFINE([STARPU_HAVE_UNISTD_H], [1], [Define to 1 if you have the <unistd.h> header file.])])
+
 AC_CHECK_TYPE([struct timespec],
 	       AC_DEFINE(STARPU_HAVE_STRUCT_TIMESPEC,[1],[struct timespec is defined]),
 	       [], [
@@ -643,8 +645,6 @@ AC_CHECK_TYPE([struct timespec],
 $INCLUDE_PTHREAD_H
 ])
 
-AC_CHECK_HEADERS([unistd.h], [AC_DEFINE([STARPU_HAVE_UNISTD_H], [1], [Define to 1 if you have the <unistd.h> header file.])])
-
 if test x"$enable_native_winthreads" = xyes
 then
     CPPFLAGS+=" -I$STARPU_SRC_DIR/include/pthread_win32 "
@@ -1033,13 +1033,8 @@ AC_DEFUN([STARPU_CHECK_CUDA],
 	    rm -f cuda_test*
 	fi
 
-	# nvcc is a wrapper around GCC, and calls it with the -dumpspecs
-	# option, which is GCC specific. If $CC does not support -dumpspecs, we
-	# should let nvcc choose another compiler (by default, gcc, if it is
-	# installed). If gcc is not installed, the build will probably fail.
-	$CC -dumpspecs >/dev/null 2>&1
-	if test $? -eq 0 -a x$starpu_windows != xyes; then
-	    NVCCFLAGS="${NVCCFLAGS} -ccbin \${CC}"
+	if test -n "$NVCC_CC"; then
+	    NVCCFLAGS="${NVCCFLAGS} -ccbin \${NVCC_CC}"
 	fi
 	if test "$__cuda_include_dir" != "no"; then
 	    STARPU_CUDA_CPPFLAGS="-I$__cuda_include_dir"
@@ -1127,7 +1122,6 @@ if test x$enable_cuda = xyes; then
 	#in case this is a 64bit setup, we tell nvcc to use a -m64 flag
 	if test x$SIZEOF_VOID_P = x8; then
 		NVCCFLAGS="${NVCCFLAGS} -m64"
-		AC_SUBST(NVCCFLAGS)
 	fi
 
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
@@ -1197,6 +1191,7 @@ if test x$enable_cuda = xyes; then
 fi
 
 AC_ARG_VAR([NVCC], [CUDA compiler])
+AC_ARG_VAR([NVCC_CC], [C compiler for CUDA compiler])
 AC_ARG_VAR([NVCCFLAGS], [CUDA compiler flags])
 
 ###############################################################################
@@ -1927,12 +1922,6 @@ if test x$use_fxt = xyes; then
 		FXT_LIBS="$(pkg-config --variable=libdir fxt)/libfxt.a -Wl,--as-needed $(pkg-config --libs --static fxt) -Wl,--no-as-needed"
 	fi
 
-        AC_ARG_ENABLE(paje-codelet-details, [AS_HELP_STRING([--enable-paje-codelet-details],
-			[enable details about codelets in the paje trace])],
-			enable_paje_codelet_details=$enableval, enable_paje_codelet_details=no)
-        if  test x$enable_paje_codelet_details = xyes; then
-        	AC_DEFINE(STARPU_ENABLE_PAJE_CODELET_DETAILS, [1], [enable details about codelets in the paje trace])
-        fi
 	##########################################
 	# Poti is a library to generate paje trace files
 	##########################################

+ 2 - 2
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012, 2016 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -263,7 +263,7 @@ double delay += starpu_timing_timespec_delay_us(&info->submit_time, &info->start
 /* How long was the task execution ? */
 double length += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
 
-/* We don't need the task structure anymore */
+/* We no longer need the task structure */
 starpu_task_destroy(task);
 \endcode
 

+ 15 - 8
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -122,10 +122,6 @@ $ vite paje.trace
 
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
-Details of the codelet execution can be obtained by passing
-\ref enable-paje-codelet-details "--enable-paje-codelet-details" when
-configuring StarPU and using a recent enough version of ViTE (at least
-r1430).
 
 In the MPI execution case, \ref STARPU_GENERATE_TRACE will not work as expected
 (each node will try to generate paje.trace, thus mixing outputs...), you have to
@@ -139,16 +135,27 @@ $ starpu_fxt_tool -i /tmp/prof_file_something1 -i /tmp/prof_file_something2
 By default, all tasks are displayed using a green color. To display tasks with
 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
 
+By default, the trace contains all informations. To reduce the trace size,
+various <c>-no-foo</c> options can be passed to <c>starpu_fxt_tool</c>, see
+<c>starpu_fxt_tool --help</c> .
+
 To identify tasks precisely, the application can set the starpu_task::tag_id field of the
-task (or use \ref STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
-enough version of ViTE (>= r1430) and the
-\ref enable-paje-codelet-details "--enable-paje-codelet-details"
-StarPU configure option, the value of the tag will show up in the trace.
+task (or use \ref STARPU_TAG_ONLY when using starpu_task_insert()), and
+the value of the tag will show up in the trace.
 
 It can also set the starpu_task::name field of the task (or use \ref STARPU_NAME)
 when using starpu_task_insert()), to replace in traces the name of the codelet
 with an arbitrarily chosen name.
 
+It can also set the iteration number, by just calling starpu_iteration_push()
+at the beginning of submission loops and starpu_iteration_pop() at the end of
+submission loops. These iteration numbers will show up in traces for all tasks
+submitted from there.
+
+Coordinates can also be given to data with the starpu_data_set_coordinates() or
+starpu_data_set_coordinates_array() function. In the trace, tasks will then be
+assigned the coordinates of the first data they write to.
+
 Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for instance:
 
 \verbatim

+ 8 - 0
doc/doxygen/chapters/390_faq.doxy

@@ -344,4 +344,12 @@ If that program does not find your device, the problem is not at the StarPU
 level, but the OpenCL drivers, check the documentation of your OpenCL
 implementation.
 
+\section IncorrectPerformanceModelFile I keep getting a "Incorrect performance model file" error
+
+The performance model file, used by StarPU to record the performance of
+codelets, seem to have been corrupted. Perhaps a previous run of StarPU stopped
+abruptly, and thus could not save it properly.  You can have a look at the file
+if you can fix it, but the simplest way is to just remove the file and run
+again, StarPU will just have to re-perform calibration for the corresponding codelet.
+
 */

+ 16 - 2
doc/doxygen/chapters/410_mpi_support.doxy

@@ -130,7 +130,9 @@ int main(int argc, char **argv)
     int rank, size;
 
     starpu_init(NULL);
-    starpu_mpi_initialize_extended(&rank, &size);
+    starpu_mpi_init(&argc, &argv, 1);
+    starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+    starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
 
     starpu_vector_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, 1, sizeof(unsigned));
 
@@ -140,7 +142,8 @@ int main(int argc, char **argv)
     unsigned last_loop = nloops - 1;
     unsigned last_rank = size - 1;
 
-    for (loop = 0; loop < nloops; loop++) {
+    for (loop = 0; loop < nloops; loop++)
+    {
         int tag = loop*size + rank;
 
         if (loop == 0 && rank == 0)
@@ -188,6 +191,17 @@ and starpu_mpi_isend_detached(), which just submit the communication to be
 performed. The only remaining synchronization with starpu_data_acquire() is at
 the beginning and the end.
 
+\section MPIInitialization How to Initialize StarPU-MPI
+
+As seen in the previous example, one has to call starpu_mpi_init() to
+initialize StarPU-MPI. The third parameter of the function indicates
+if MPI should be initialized by StarPU or if the application will do
+it itself. If the application initializes MPI itself, it must call
+<c>MPI_Init_thread()</c> with <c>MPI_THREAD_SERIALIZED</c> or
+<c>MPI_THREAD_MULTIPLE</c>, since StarPU-MPI uses a separate thread to
+perform the communications. <c>MPI_THREAD_MULTIPLE</c> is necessary if
+the application also performs some MPI communications.
+
 \section PointToPointCommunication Point To Point Communication
 
 The standard point to point communications of MPI have been

+ 5 - 5
doc/doxygen/chapters/470_simgrid.doxy

@@ -9,7 +9,7 @@
 /*! \page SimGridSupport SimGrid Support
 
 StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform. This was tested with simgrid 3.11, 3.12 and 3.13, other versions may have
+platform. This was tested with simgrid 3.11, 3.12, 3.13, 3.14, and 3.14.159, other versions may have
 compatibility issues.
 
 \section Preparing Preparing Your Application For Simulation
@@ -38,6 +38,8 @@ express that there is a CUDA implementation, even if one does not actually
 provide it. StarPU will not actually run it in Simgrid mode anyway by default
 (unless the ::STARPU_CODELET_SIMGRID_EXECUTE flag is set in the codelet)
 
+\snippet simgrid.c To be included. You should update doxygen if you see this text.
+
 \section Calibration Calibration
 
 The idea is to first compile StarPU normally, and run the application,
@@ -141,8 +143,8 @@ be extended as well), change the available GPU memory size, PCI memory bandwidth
 The simulation can be tweaked, to be able to tune it between a very accurate
 simulation and a very simple simulation (which is thus close to scheduling
 theory results), see the \ref STARPU_SIMGRID_CUDA_MALLOC_COST,
-\ref STARPU_SIMGRID_CUDA_QUEUE_COST and \ref STARPU_SIMGRID_TASK_SUBMIT_COST
-environment variables.
+\ref STARPU_SIMGRID_CUDA_QUEUE_COST, \ref STARPU_SIMGRID_TASK_SUBMIT_COST,
+\ref STARPU_SIMGRID_FETCHING_INPUT_COST and STARPU_SIMGRID_SCHED_COST environment variables.
 
 \section SimulationMPIApplications MPI Applications
 
@@ -166,8 +168,6 @@ application running with simgrid, pass the <c>--cfg=contexts/factory:thread</c>
 option to the application, to make simgrid use system threads, which gdb will be
 able to manipulate as usual.
 
-\snippet simgrid.c To be included. You should update doxygen if you see this text.
-
 \section SimulationMemoryUsage Memory Usage
 
 Since kernels are not actually run and data transfers are not actually

+ 22 - 4
doc/doxygen/chapters/501_environment_variables.doxy

@@ -629,6 +629,24 @@ account in simgrid mode. This provides more accurate simgrid predictions,
 especially for the beginning of the execution.
 </dd>
 
+<dt>STARPU_SIMGRID_FETCHING_INPUT_COST</dt>
+<dd>
+\anchor STARPU_SIMGRID_FETCHING_INPUT_COST
+\addindex __env__STARPU_SIMGRID_FETCHING_INPUT_COST
+When set to 1 (which is the default), fetching input costs are taken into
+account in simgrid mode. This provides more accurate simgrid predictions,
+especially regarding data transfers.
+</dd>
+
+<dt>STARPU_SIMGRID_SCHED_COST</dt>
+<dd>
+\anchor STARPU_SIMGRID_SCHED_COST
+\addindex __env__STARPU_SIMGRID_SCHED_COST
+When set to 1 (which is the default), scheduling costs are taken into
+account in simgrid mode. This provides more accurate simgrid predictions,
+and allows studying scheduling overhead of the runtime system.
+</dd>
+
 </dl>
 
 \section MiscellaneousAndDebug Miscellaneous And Debug
@@ -661,7 +679,7 @@ This specifies the main directory in which StarPU stores its
 performance model files. The default is <c>$STARPU_HOME/.starpu/sampling</c>.
 </dd>
 
-<dt>STARPU_PERF_MODEL_HOMEGENEOUS_CUDA</dt>
+<dt>STARPU_PERF_MODEL_HOMOGENEOUS_CUDA</dt>
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_CUDA
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_CUDA
@@ -671,7 +689,7 @@ calibration to be much faster, since measurements only have to be once for all
 CUDA GPUs.
 </dd>
 
-<dt>STARPU_PERF_MODEL_HOMEGENEOUS_OPENCL</dt>
+<dt>STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL</dt>
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL
@@ -681,7 +699,7 @@ calibration to be much faster, since measurements only have to be once for all
 OPENCL GPUs.
 </dd>
 
-<dt>STARPU_PERF_MODEL_HOMEGENEOUS_MIC</dt>
+<dt>STARPU_PERF_MODEL_HOMOGENEOUS_MIC</dt>
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_MIC
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_MIC
@@ -701,7 +719,7 @@ calibration to be much faster, since measurements only have to be once for all
 MPI Slaves.
 </dd>
 
-<dt>STARPU_PERF_MODEL_HOMEGENEOUS_SCC</dt>
+<dt>STARPU_PERF_MODEL_HOMOGENEOUS_SCC</dt>
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_SCC
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_SCC

+ 3 - 10
doc/doxygen/chapters/510_configure_options.doxy

@@ -113,8 +113,9 @@ machine which does not have the tools <c>doxygen</c> and <c>latex</c>
 
 Additionally, the script <c>configure</c> recognize many variables, which
 can be listed by typing <c>./configure --help</c>. For example,
-<c>./configure NVCCFLAGS="-arch sm_13"</c> adds a flag for the compilation of
-CUDA kernels.
+<c>./configure NVCCFLAGS="-arch sm_20"</c> adds a flag for the compilation of
+CUDA kernels, and <c>NVCC_CC=gcc-5</c> allows to change the C++ compiler
+used by nvcc.
 
 
 \section ConfiguringWorkers Configuring Workers
@@ -444,14 +445,6 @@ Enable performance debugging through gprof.
 Enable performance model debugging.
 </dd>
 
-<dt>--enable-paje-codelet-details</dt>
-<dd>
-\anchor enable-paje-codelet-details
-\addindex __configure__--enable-paje-codelet-details
-Enable details about codelets in the paje trace. This requires a recent enough
-version of ViTE (at least r1430).
-</dd>
-
 <dt>--enable-fxt-lock</dt>
 <dd>
 \anchor enable-fxt-lock

+ 17 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -951,6 +951,23 @@ codelet implementation to be executed when executing \p task.
 Return the codelet implementation to be executed
 when executing \p task.
 
+\fn void starpu_iteration_push(unsigned long iteration)
+\ingroup API_Codelet_And_Tasks
+Sets the iteration number for all the tasks to be submitted after
+this call. This is typically called at the beginning of a task
+submission loop. This number will then show up in tracing tools. A
+corresponding starpu_iteration_pop() call must be made to match the call to
+starpu_iteration_push(), at the end of the same task submission loop, typically.
+
+Nested calls to starpu_iteration_push and starpu_iteration_pop are allowed, to
+describe a loop nest for instance, provided that they match properly.
+
+\fn void starpu_iteration_pop(void)
+\ingroup API_Codelet_And_Tasks
+Drops the iteration number for submitted tasks. This must match a previous
+call to starpu_iteration_push(), and is typically called at the end of a task
+submission loop.
+
 \fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg)
 \ingroup API_Codelet_And_Tasks
 Create (and submit) an empty task that unlocks a tag once all its dependencies are fulfilled.

+ 52 - 7
doc/doxygen/chapters/api/data_management.doxy

@@ -155,7 +155,7 @@ buffer that was initially registered.
 
 \fn void starpu_data_unregister_submit(starpu_data_handle_t handle)
 \ingroup API_Data_Management
-Destroy the data \p handle once it is not needed anymore by any
+Destroy the data \p handle once it is no longer needed by any
 submitted task. No coherency is assumed.
 
 \fn void starpu_data_invalidate(starpu_data_handle_t handle)
@@ -180,6 +180,22 @@ modified, it is automatically transfered into those memory nodes. For
 instance a <c>1<<0</c> write-through mask means that the CUDA workers
 will commit their changes in main memory (node 0).
 
+\fn void starpu_data_set_name(starpu_data_handle_t handle, const char *name)
+\ingroup API_Data_Management
+Set the name of the data, to be shown in various profiling tools.
+
+\fn void starpu_data_set_coordinates_array(starpu_data_handle_t handle, int dimensions, int dims[])
+\ingroup API_Data_Management
+Set the coordinates of the data, to be shown in various profiling tools.
+\p dimensions is the size of the \p dims array
+This can be for instance the tile coordinates within a big matrix.
+
+\fn void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimensions, ...)
+\ingroup API_Data_Management
+Set the coordinates of the data, to be shown in various profiling tools.
+\p dimensions is the number of subsequent \c int parameters.
+This can be for instance the tile coordinates within a big matrix.
+
 \fn int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 \ingroup API_Data_Management
 Issue a fetch request for the data \p handle to \p node, i.e.
@@ -264,8 +280,8 @@ the application will get an up-to-date copy of \p handle in main memory
 located where the data was originally registered, and that all
 concurrent accesses (e.g. from tasks) will be consistent with the
 access mode specified with \p mode. starpu_data_release() must
-be called once the application does not need to access the piece of
-data anymore. Note that implicit data dependencies are also enforced
+be called once the application no longer needs to access the piece of
+data. Note that implicit data dependencies are also enforced
 by starpu_data_acquire(), i.e. starpu_data_acquire() will wait for all
 tasks scheduled to work on the data, unless they have been disabled
 explictly by calling starpu_data_set_default_sequential_consistency_flag() or
@@ -280,8 +296,8 @@ Asynchronous equivalent of starpu_data_acquire(). When the data
 specified in \p handle is available in the access \p mode, the \p
 callback function is executed. The application may access
 the requested data during the execution of \p callback. The \p callback
-function must call starpu_data_release() once the application does not
-need to access the piece of data anymore. Note that implicit data
+function must call starpu_data_release() once the application no longer
+needs to access the piece of data. Note that implicit data
 dependencies are also enforced by starpu_data_acquire_cb() in case they
 are not disabled. Contrary to starpu_data_acquire(), this function is
 non-blocking and may be called from task callbacks. Upon successful
@@ -293,14 +309,27 @@ Equivalent of starpu_data_acquire_cb() with the possibility of enabling or disab
 When the data specified in \p handle is available in the access
 \p mode, the \p callback function is executed. The application may access
 the requested data during the execution of this \p callback. The \p callback
-function must call starpu_data_release() once the application does not
-need to access the piece of data anymore. Note that implicit data
+function must call starpu_data_release() once the application no longer
+needs to access the piece of data. Note that implicit data
 dependencies are also enforced by starpu_data_acquire_cb_sequential_consistency() in case they
 are not disabled specifically for the given \p handle or by the parameter \p sequential_consistency.
 Similarly to starpu_data_acquire_cb(), this function is
 non-blocking and may be called from task callbacks. Upon successful
 completion, this function returns 0.
 
+\fn int starpu_data_acquire_try(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
+\ingroup API_Data_Management
+The application can call this function instead of starpu_data_acquire() so as to
+acquire the data like starpu_data_acquire(), but only if all
+previously-submitted tasks have completed, in which case starpu_data_acquire_try()
+returns 0. StarPU will have ensured that the application will get an up-to-date
+copy of \p handle in main memory located where the data was originally
+registered. starpu_data_release() must be called once the application no longer
+needs to access the piece of data.
+
+If not all previously-submitted tasks have completed, starpu_data_acquire_try
+returns -EAGAIN, and starpu_data_release() must not be called.
+
 \def STARPU_ACQUIRE_NO_NODE
 \ingroup API_Data_Management
 This macro can be used to acquire data, but not require it to be available on a given node, only enforce R/W dependencies.
@@ -335,6 +364,22 @@ memory.
 ::STARPU_ACQUIRE_NO_NODE and ::STARPU_ACQUIRE_NO_NODE_LOCK_ALL can be used instead of an
 explicit node number.
 
+\fn int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency, long *pre_sync_jobid, long *post_sync_jobid)
+\ingroup API_Data_Management
+This is the same as starpu_data_acquire_on_node_cb_sequential_consistency(),
+except that the \e pre_sync_jobid and \e post_sync_jobid parameters can be used
+to retrieve the jobid of the synchronization tasks. \e pre_sync_jobid happens
+just before the acquisition, and \e post_sync_jobid happens just after the
+release.
+
+\fn int starpu_data_acquire_on_node_try(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode)
+\ingroup API_Data_Management
+This is the same as starpu_data_acquire_try(), except that the
+data will be available on the given memory node instead of main
+memory.
+::STARPU_ACQUIRE_NO_NODE and ::STARPU_ACQUIRE_NO_NODE_LOCK_ALL can be used instead of an
+explicit node number.
+
 \def STARPU_DATA_ACQUIRE_CB(handle, mode, code)
 \ingroup API_Data_Management
 STARPU_DATA_ACQUIRE_CB() is the same as starpu_data_acquire_cb(),

+ 0 - 2
doc/doxygen/chapters/api/modularized_scheduler.doxy

@@ -122,8 +122,6 @@ The actual scheduler
 	this lock is used to protect the scheduler, it is taken in
 	read mode pushing a task and in write mode for adding or
 	removing workers
-\var struct starpu_sched_component *starpu_sched_tree::worker_components[STARPU_NMAXWORKERS]
-     	worker components
 
 @name Scheduling Tree API
 \ingroup API_Modularized_Scheduler

+ 5 - 0
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -106,6 +106,11 @@ condition variable. For instance, in the case of a scheduling strategy
 with a single task queue, the same condition variable would be used to
 block and wake up all workers.
 
+\fn void starpu_worker_get_job_id(struct starpu_task *task)
+\ingroup API_Scheduling_Policy
+Return the job id of the given task, i.e. a number that uniquely identifies this
+task for the local MPI node, and can be found in the various offline execution
+traces reports.
 
 \fn int starpu_sched_set_min_priority(int min_prio)
 \ingroup API_Scheduling_Policy

+ 3 - 9
examples/Makefile.am

@@ -179,17 +179,16 @@ LOADER_BIN		=	$(top_builddir)/examples/loader-cross.sh
 endif
 
 if STARPU_USE_MPI_MASTER_SLAVE
-MPI 			= $(MPIEXEC) $(MPIEXEC_ARGS) -np 4
-LOADER_BIN2		= $(MPI) $(LOADER_BIN)
+LOADER_BIN2		= $(MPI_LAUNCHER) $(LOADER_BIN)
 else
 LOADER_BIN2		= $(LOADER_BIN)
 endif
 
 if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+TESTS_ENVIRONMENT	=	$(MPI_RUN_ARGS) top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
 LOG_COMPILER		=	$(LOADER_BIN2)
 else
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN2)
+TESTS_ENVIRONMENT	=	$(MPI_RUN_ARGS) top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN2)
 endif
 
 endif
@@ -812,11 +811,6 @@ cg_cg_SOURCES =					\
 	cg/cg_kernels.c				\
 	common/blas.c
 
-if STARPU_USE_CUDA
-cg_cg_SOURCES +=				\
-	cg/cg_dot_kernel.cu
-endif
-
 cg_cg_LDADD =					\
 	$(STARPU_BLAS_LDFLAGS)
 

+ 7 - 1
examples/basic_examples/multiformat.c

@@ -274,7 +274,13 @@ main(void)
 #ifdef STARPU_USE_CPU
 	int ret;
 
-	ret = starpu_init(NULL);
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+
+	/* this example doesn't support MPI Master-Slave */
+	conf.nmpi_ms = 0;
+
+	ret = starpu_init(&conf);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	ncpu = starpu_cpu_worker_get_count();

+ 4 - 1
examples/cg/cg.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2014-2016  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2014-2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -308,6 +308,8 @@ static int cg(void)
 		double delta_old;
 		double alpha, beta;
 
+		starpu_iteration_push(i);
+
 		/* q <- A d */
 		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
 
@@ -356,6 +358,7 @@ static int cg(void)
 			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
 		}
 
+		starpu_iteration_pop();
 		i++;
 	}
 

+ 0 - 156
examples/cg/cg_dot_kernel.cu

@@ -1,156 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2015  Université de Bordeaux
- * Copyright (C) 2010, 2012, 2015  CNRS
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-
-#include "cg.h"
-
-#define MAXNBLOCKS	128
-#define MAXTHREADSPERBLOCK	256
-
-/*
- * Dot product kernel
- * We first perform dot computation in parallel in dot_device, and then we
- * gather the dot values into one in gather_dot_device.
- */
-
-static __global__ void dot_device(TYPE *vx, TYPE *vy, unsigned n, TYPE *dot_array)
-{
-	__shared__ TYPE scnt[MAXTHREADSPERBLOCK];
-
-	/* Do we have a successful shot ? */
-	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
-
-	const int nthreads = gridDim.x * blockDim.x;
-
-	/* Blank the shared mem buffer */
-	if (threadIdx.x < MAXTHREADSPERBLOCK)
-		scnt[threadIdx.x] = (TYPE)0.0;
-
-	__syncthreads();
-
-	int ind;
-	for (ind = tid; ind < n; ind += nthreads)
-	{
-		TYPE x = vx[ind];
-		TYPE y = vy[ind];
-
-		scnt[threadIdx.x] += (x*y);
-	}
-
-	__syncthreads();
-
-	/* Perform a reduction to compute the sum on each thread within that block */
-
-	/* NB: We assume that the number of threads per block is a power of 2 ! */
-	unsigned s;
-	for (s = blockDim.x/2; s!=0; s>>=1)
-	{
-		if (threadIdx.x < s)
-			scnt[threadIdx.x] += scnt[threadIdx.x + s];
-
-		__syncthreads();
-	}
-
-	/* report the number of successful shots in the block */
-	if (threadIdx.x == 0)
-		dot_array[blockIdx.x] = scnt[0];
-
-	__syncthreads();
-}
-
-static __global__ void gather_dot_device(TYPE *dot_array, TYPE *dot)
-{
-	__shared__ TYPE accumulator[MAXNBLOCKS];
-
-	unsigned i;
-
-	/* Load the values from global mem */
-	for (i = 0; i < blockDim.x; i++)
-		accumulator[i] = dot_array[i];
-
-	__syncthreads();
-
-	/* Perform a reduction in shared memory */
-	unsigned s;
-	for (s = blockDim.x/2; s!=0; s>>=1)
-	{
-		if (threadIdx.x < s)
-			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
-
-		__syncthreads();
-	}
-
-
-	/* Save the result in global memory */
-	if (threadIdx.x == 0)
-		*dot = *dot + accumulator[0];
-}
-
-extern "C" void dot_host(TYPE *x, TYPE *y, unsigned nelems, TYPE *dot)
-{
-	/* How many blocks do we use ? */ 
-	unsigned nblocks = 128; // TODO
-	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
-	
-	TYPE *per_block_sum;
-	cudaMalloc((void **)&per_block_sum, nblocks*sizeof(TYPE));
-
-	STARPU_ASSERT((nelems % nblocks) == 0);
-
-	/* How many threads per block ? At most 256, but no more threads than
-	 * there are entries to process per block. */
-	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nelems / nblocks));
-
-	/* each entry of per_block_sum contains the number of successful shots
-	 * in the corresponding block. */
-	dot_device<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, nelems, per_block_sum);
-
-	/* Note that we do not synchronize between kernel calls because there
-	 * is an implicit serialization */
-	gather_dot_device<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_sum, dot);
-
-	cudaError_t cures;
-	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	if (cures)
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cudaFree(per_block_sum);
-}
-
-/*
- * Fill a vector with zeroes
- */
-
-static __global__ void zero_vector_device(TYPE *x, unsigned nelems, unsigned nelems_per_thread)
-{
-	unsigned i;
-	unsigned first_i = blockDim.x * blockIdx.x + threadIdx.x;
-
-	for (i = first_i; i < nelems; i += nelems_per_thread)
-		x[i] = 0.0;
-}
-
-extern "C" void zero_vector(TYPE *x, unsigned nelems)
-{
-	unsigned nblocks = STARPU_MIN(128, nelems);
-	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nelems / nblocks));
-
-	unsigned nelems_per_thread = nelems / (nblocks * nthread_per_block);
-
-	zero_vector_device<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, nelems, nelems_per_thread);
-}

+ 13 - 25
examples/cg/cg_kernels.c

@@ -173,8 +173,9 @@ extern void zero_vector(TYPE *x, unsigned nelems);
 static void bzero_variable_cuda(void *descr[], void *cl_arg)
 {
 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	size_t size = STARPU_VARIABLE_GET_ELEMSIZE(descr[0]);
 
-	zero_vector(v, 1);
+	cudaMemsetAsync(v, 0, size, starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -209,8 +210,9 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 {
 	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
- 
-	zero_vector(v, n);
+	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
+
+	cudaMemsetAsync(v, 0, n * elemsize, starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -247,8 +249,6 @@ struct starpu_codelet bzero_vector_cl =
  */
 
 #ifdef STARPU_USE_CUDA
-extern void dot_host(TYPE *x, TYPE *y, unsigned nelems, TYPE *dot);
-
 static void dot_kernel_cuda(void *descr[], void *cl_arg)
 {
 	TYPE *dot = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]); 
@@ -257,26 +257,13 @@ static void dot_kernel_cuda(void *descr[], void *cl_arg)
 
 	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
 
-	int version;
-	cublasGetVersion(starpu_cublas_get_local_handle(), &version);
-
-	/* FIXME: check in Nvidia bug #1882017 when this gets fixed */
-	if (version < 99999)
-	{
-		/* This function puts its result directly in device memory, so
-		 * that we don't have to transfer that value back and forth. */
-		dot_host(v1, v2, n, dot);
-	}
-	else
-	{
-		/* Should be able to put result in GPU, but does not yet, see
-		 * Nvidia bug #1882017 */
-		cublasStatus_t status = cublasdot(starpu_cublas_get_local_handle(),
-			n, v1, 1, v2, 1, dot);
-		if (status != CUBLAS_STATUS_SUCCESS)
-			STARPU_CUBLAS_REPORT_ERROR(status);
-		cudaStreamSynchronize(starpu_cuda_get_local_stream());
-	}
+	cublasHandle_t handle = starpu_cublas_get_local_handle();
+	cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
+	cublasStatus_t status = cublasdot(handle,
+		n, v1, 1, v2, 1, dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
 }
 #endif
 
@@ -310,6 +297,7 @@ static struct starpu_codelet dot_kernel_cl =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dot_kernel_cuda},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.model = &dot_kernel_model
 };

+ 3 - 1
examples/cholesky/cholesky_grain_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2017  CNRS
  *
@@ -190,6 +190,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	for (k = 0; k < nbigblocks; k++)
 	{
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(dataA, k, reclevel);
 		/* we defer the launch of the first task */
 		if (k == 0)
@@ -217,6 +218,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 				}
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	/* schedule the codelet */

+ 13 - 3
examples/cholesky/cholesky_implicit.c

@@ -59,10 +59,11 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	for (k = 0; k < nblocks; k++)
 	{
 		int ret;
+		starpu_iteration_push(k);
                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 
                 ret = starpu_task_insert(&cl11,
-					 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? 2*nblocks - 2*k : STARPU_MAX_PRIO,
+					 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 					 STARPU_RW, sdatakk,
 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
 					 STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
@@ -76,7 +77,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
                         starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
 
                         ret = starpu_task_insert(&cl21,
-						 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? 2*nblocks - 2*k - j : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j) : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						 STARPU_R, sdatakk,
 						 STARPU_RW, sdatakj,
 						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
@@ -98,7 +99,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
 
 					ret = starpu_task_insert(&cl22,
-								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? 2*nblocks - 2*k - j - i : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j - i) : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 								 STARPU_R, sdataki,
 								 STARPU_R, sdatakj,
 								 cl22.modes[2], sdataij,
@@ -111,6 +112,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 			}
 			starpu_data_wont_use(sdatakj);
 		}
+		starpu_iteration_pop();
 	}
 
 	starpu_task_wait_for_all();
@@ -161,6 +163,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
 	starpu_data_handle_t dataA;
+	unsigned x, y;
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
@@ -180,6 +183,13 @@ static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
+	for (x = 0; x < nblocks; x++)
+		for (y = 0; y < nblocks; y++)
+		{
+			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, x, y);
+			starpu_data_set_coordinates(data, 2, x, y);
+		}
+
 	int ret = _cholesky(dataA, nblocks);
 
 	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);

+ 24 - 21
examples/cholesky/cholesky_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2012, 2014-2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,7 +35,7 @@
 #endif
 
 /*
- *   U22 
+ *   U22
  */
 
 #if defined(STARPU_USE_CUDA)
@@ -65,7 +65,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 		if (worker_size == 1)
 		{
 			/* Sequential CPU kernel */
-			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
 				right, ld12, 1.0f, center, ld22);
 		}
 		else
@@ -75,11 +75,11 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 
 			unsigned block_size = (dx + worker_size - 1)/worker_size;
 			unsigned new_dx = STARPU_MIN(dx, block_size*(rank+1)) - block_size*rank;
-			
+
 			float *new_left = &left[block_size*rank];
 			float *new_center = &center[block_size*rank];
 
-			STARPU_SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
+			STARPU_SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21,
 				right, ld12, 1.0f, new_center, ld22);
 		}
 	}
@@ -88,8 +88,8 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 		/* CUDA kernel */
 #ifdef STARPU_USE_CUDA
 		cublasStatus_t status = cublasSgemm(starpu_cublas_get_local_handle(),
-				CUBLAS_OP_N, CUBLAS_OP_T, dy, dx, dz, 
-				&m1, left, ld21, right, ld12, 
+				CUBLAS_OP_N, CUBLAS_OP_T, dy, dx, dz,
+				&m1, left, ld21, right, ld12,
 				&p1, center, ld22);
 		if (status != CUBLAS_STATUS_SUCCESS)
 			STARPU_CUBLAS_REPORT_ERROR(status);
@@ -110,7 +110,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 }
 #endif /* STARPU_USE_CUDA */
 
-/* 
+/*
  * U21
  */
 
@@ -163,18 +163,18 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args)
 {
 	chol_common_codelet_update_u21(descr, 1, _args);
 }
-#endif 
+#endif
 
 /*
  *	U11
  */
 
-static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) 
+static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 /*	printf("11\n"); */
 	float *sub11;
 
-	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 
 	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
 	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
@@ -201,10 +201,10 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 				sub11[z+z*ld] = lambda11;
 
 				STARPU_ASSERT(lambda11 != 0.0f);
-		
+
 				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
-		
-				STARPU_SSYR("L", nx - z - 1, -1.0f, 
+
+				STARPU_SSYR("L", nx - z - 1, -1.0f,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
@@ -247,12 +247,11 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 
 			for (z = 0; z < nx; z++)
 			{
-				
 				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
 				cudaStreamSynchronize(stream);
 
 				STARPU_ASSERT(*lambda11 != 0.0f);
-				
+
 				*lambda11 = sqrt(*lambda11);
 
 /*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
@@ -260,13 +259,17 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 				float scal = 1.0f/(*lambda11);
 
 				status = cublasSscal(handle,
-						nx - z - 1, &scal, &sub11[(z+1)+z*ld], 1);
+						     nx - z - 1, &scal, &sub11[(z+1)+z*ld], 1);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 
 				status = cublasSsyr(handle,
-							CUBLAS_FILL_MODE_UPPER,
-							nx - z - 1, &m1,
-							&sub11[(z+1)+z*ld], 1,
-							&sub11[(z+1)+(z+1)*ld], ld);
+						    CUBLAS_FILL_MODE_UPPER,
+						    nx - z - 1, &m1,
+						    &sub11[(z+1)+z*ld], 1,
+						    &sub11[(z+1)+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 
 			cudaStreamSynchronize(stream);

+ 3 - 1
examples/cholesky/cholesky_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013, 2017  CNRS
  *
@@ -171,6 +171,7 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	for (k = 0; k < nblocks; k++)
 	{
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(dataA, k);
 		/* we defer the launch of the first task */
 		if (k == 0)
@@ -198,6 +199,7 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					create_task_22(dataA, k, i, j);
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	/* schedule the codelet */

+ 4 - 1
examples/cholesky/cholesky_tile_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -167,6 +167,7 @@ static int cholesky_no_stride(void)
 
 	for (k = 0; k < nblocks_p; k++)
 	{
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(k, nblocks_p);
 		/* we defer the launch of the first task */
 		if (k == 0)
@@ -193,6 +194,7 @@ static int cholesky_no_stride(void)
 				}
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	/* schedule the codelet */
@@ -291,6 +293,7 @@ int main(int argc, char **argv)
 		{
 			starpu_matrix_data_register(&A_state[y][x], STARPU_MAIN_RAM, (uintptr_t)A[y][x],
 						    BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
+			starpu_data_set_coordinates(A_state[y][x], 2, x, y);
 		}
 	}
 

+ 7 - 1
examples/cpp/add_vectors.cpp

@@ -61,8 +61,14 @@ int main(int argc, char **argv)
 	std::vector<char> vec_B(VEC_SIZE, 3); // all the vector is initialized to 3
 	std::vector<char> vec_C(VEC_SIZE, 0); // all the vector is initialized to 0
 
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.nmic = 0;
+	conf.nscc = 0;
+	conf.nmpi_ms = 0;
+
 	// initialize StarPU with default configuration
-	int ret = starpu_init(NULL);
+	int ret = starpu_init(&conf);
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

+ 7 - 1
examples/cpp/add_vectors_cpp11.cpp

@@ -67,8 +67,14 @@ int main(int argc, char **argv)
 	std::vector<char> vec_B(vec_size, 3); // all the vector is initialized to 3
 	std::vector<char> vec_C(vec_size, 0); // all the vector is initialized to 0
 
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.nmic = 0;
+	conf.nscc = 0;
+	conf.nmpi_ms = 0;
+
 	// initialize StarPU with default configuration
-	auto ret = starpu_init(NULL);
+	auto ret = starpu_init(&conf);
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

+ 7 - 1
examples/cpp/incrementer_cpp.cpp

@@ -51,7 +51,13 @@ int main(int argc, char **argv)
 	unsigned i;
 	unsigned niter = 50;
 
-	ret = starpu_init(NULL);
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.nmic = 0;
+	conf.nscc = 0;
+	conf.nmpi_ms = 0;
+
+	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 

+ 10 - 6
examples/heat/dw_factolu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2012, 2014-2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -366,13 +366,17 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 				float scal = 1.0f/pivot;
 
 				status = cublasSscal(starpu_cublas_get_local_handle(),
-						nx - z - 1, &scal, &sub11[z+(z+1)*ld], ld);
+						     nx - z - 1, &scal, &sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 
 				status = cublasSger(starpu_cublas_get_local_handle(),
-						nx - z - 1, nx - z - 1, &m1,
-								&sub11[z+(z+1)*ld], ld,
-								&sub11[(z+1)+z*ld], 1,
-								&sub11[(z+1) + (z+1)*ld],ld);
+						    nx - z - 1, nx - z - 1, &m1,
+						    &sub11[z+(z+1)*ld], ld,
+						    &sub11[(z+1)+z*ld], 1,
+						    &sub11[(z+1) + (z+1)*ld],ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 
 			cudaStreamSynchronize(stream);

+ 19 - 18
examples/heat/dw_sparse_cg_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux
- * Copyright (C) 2010  CNRS
+ * Copyright (C) 2010, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,7 +23,7 @@
 
 /*
  *	Algorithm :
- *		
+ *
  *		i = 0
  *		r = b - A x
  *			( d = A x ; r = r - d )
@@ -31,14 +31,14 @@
  *		delta_new = trans(r) r
  *		delta_0 = delta_new
  *
- * 		while (i < i_max && delta_new > eps^2 delta_0) 
+ * 		while (i < i_max && delta_new > eps^2 delta_0)
  * 		{
  *			q = A d
  *			alpha = delta_new / ( trans(d) q )
  *			x = x + alpha d
  *			if ( i is divisible by 50 )
  *				r = b - A x
- *			else 
+ *			else
  *				r = r - alpha q
  *			delta_old = delta_new
  *			delta_new = trans(r) r
@@ -125,7 +125,7 @@ void cpu_codelet_func_3(void *descr[], void *arg)
 	float dot;
 	float *vec;
 	int size;
-	
+
 	/* get the vector */
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = (int)STARPU_VECTOR_GET_NX(descr[0]);
@@ -145,7 +145,7 @@ void cublas_codelet_func_3(void *descr[], void *arg)
 	float dot;
 	float *vec;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
@@ -204,7 +204,7 @@ void cpu_codelet_func_4(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 
 }
 
-/* 
+/*
  *	compute alpha = delta_new / ( trans(d) q )
  *
  * 		descr[0] = d, descr[1] = q
@@ -217,7 +217,7 @@ void cpu_codelet_func_5(void *descr[], void *arg)
 	struct cg_problem *pb = arg;
 	float *vecd, *vecq;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -237,7 +237,7 @@ void cublas_codelet_func_5(void *descr[], void *arg)
 	struct cg_problem *pb = arg;
 	float *vecd, *vecq;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -268,7 +268,7 @@ void cpu_codelet_func_6(void *descr[], void *arg)
 	struct cg_problem *pb = arg;
 	float *vecx, *vecd;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -284,7 +284,7 @@ void cublas_codelet_func_6(void *descr[], void *arg)
 	struct cg_problem *pb = arg;
 	float *vecx, *vecd;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -309,7 +309,7 @@ void cpu_codelet_func_7(void *descr[], void *arg)
 	struct cg_problem *pb = arg;
 	float *vecr, *vecq;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -325,7 +325,7 @@ void cublas_codelet_func_7(void *descr[], void *arg)
 	struct cg_problem *pb = arg;
 	float *vecr, *vecq;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -355,7 +355,7 @@ void cpu_codelet_func_8(void *descr[], void *arg)
 	struct cg_problem *pb = arg;
 	float *vecr;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
@@ -374,12 +374,13 @@ void cublas_codelet_func_8(void *descr[], void *arg)
 	struct cg_problem *pb = arg;
 	float *vecr;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecr, 1, vecr, 1, &dot);
+	cublasStatus_t status = cublasSdot(starpu_cublas_get_local_handle(), size, vecr, 1, vecr, 1, &dot);
+	if (status != CUBLAS_STATUS_SUCCESS) STARPU_CUBLAS_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	pb->delta_old = pb->delta_new;
@@ -401,7 +402,7 @@ void cpu_codelet_func_9(void *descr[], void *arg)
 	struct cg_problem *pb = arg;
 	float *vecd, *vecr;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -421,7 +422,7 @@ void cublas_codelet_func_9(void *descr[], void *arg)
 	struct cg_problem *pb = arg;
 	float *vecd, *vecr;
 	uint32_t size;
-	
+
 	/* get the vector */
 	vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

+ 3 - 1
examples/lu/xlu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010-2011, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  CNRS
  *
@@ -184,6 +184,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 
 	for (k = 0; k < nblocks; k++)
 	{
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(dataA, k);
 
 		/* we defer the launch of the first task */
@@ -214,6 +215,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 			     if (ret == -ENODEV) return ret;
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	/* schedule the codelet */

+ 4 - 1
examples/lu/xlu_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2010-2011, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2015, 2016  CNRS
  *
@@ -127,6 +127,8 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	{
 		int ret;
 
+		starpu_iteration_push(k);
+
 		ret = create_task_11(dataA, k);
 		if (ret == -ENODEV) return ret;
 
@@ -150,6 +152,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, i));
 		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, i, k));
 		}
+		starpu_iteration_pop();
 	}
 
 	/* stall the application until the end of computations */

+ 4 - 1
examples/lu/xlu_implicit_pivot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2016  CNRS
  *
@@ -173,6 +173,8 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 	{
 		int ret;
 
+		starpu_iteration_push(k);
+
 		ret = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
 		if (ret == -ENODEV) return ret;
 
@@ -205,6 +207,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 		    starpu_data_wont_use(get_block(dataAp, nblocks, k, i));
 		    starpu_data_wont_use(get_block(dataAp, nblocks, i, k));
 		}
+		starpu_iteration_pop();
 	}
 
 	/* stall the application until the end of computations */

+ 4 - 2
examples/lu/xlu_pivot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2009-2012, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -247,6 +247,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 
 	for (k = 0; k < nblocks; k++)
 	{
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
 
 		/* we defer the launch of the first task */
@@ -258,7 +259,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 		{
 			ret = starpu_task_submit(task);
 			if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-			return ret;
+			if (ret == -ENODEV) return ret;
 		}
 
 		for (i = 0; i < nblocks; i++)
@@ -286,6 +287,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 			     if (ret == -ENODEV) return ret;
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	/* we wait the last task (TAG11(nblocks - 1)) and all the pivot tasks */

+ 4 - 1
examples/mandelbrot/mandelbrot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -544,6 +544,8 @@ int main(int argc, char **argv)
 		 * parallel task. */
 		int per_block_cnt[nblocks_p];
 
+		starpu_iteration_push(niter_p);
+
 		for (iby = 0; iby < nblocks_p; iby++)
 		{
 			per_block_cnt[iby] = 0;
@@ -577,6 +579,7 @@ int main(int argc, char **argv)
 		}
 
 
+		starpu_iteration_pop();
 		if (demo_p)
 		{
 			/* Zoom in */

+ 13 - 9
examples/mlr/mlr.c

@@ -33,7 +33,7 @@
    where M, N, K are the parameters of the task, exponents are coming
    from model->combinations[..][..]  and finally a, b, c are
    coefficients which mostly depend on the machine speed.
-   
+
    These coefficients are going to be automatically computed using
    least square method.
 
@@ -65,7 +65,7 @@ void cpu_func(void *buffers[], void *cl_arg)
 			     	  &m,
      			     	  &n,
      			     	  &k);
-	
+
 	for(i=0; i < (long) (m*m*n); i++)
 		sum+=i;
 
@@ -85,7 +85,8 @@ void cpu_func(void *buffers[], void *cl_arg)
 
 static const char * parameters_names[]	= {	"M",	"N",	"K", };
 
-static struct starpu_perfmodel cl_model_init = {
+static struct starpu_perfmodel cl_model_init =
+{
 	.type = STARPU_MULTIPLE_REGRESSION_BASED,
 	.symbol = "mlr_init",
 	.parameters = cl_params,
@@ -104,7 +105,8 @@ static unsigned combi2 [3]		= {	0,	3,	1 };
 
 static unsigned *combinations[] = { combi1, combi2 };
 
-static struct starpu_perfmodel cl_model_final = {
+static struct starpu_perfmodel cl_model_final =
+{
 	.type = STARPU_MULTIPLE_REGRESSION_BASED,
 	.symbol = "mlr_final",
 	.parameters = cl_params,
@@ -117,14 +119,16 @@ static struct starpu_perfmodel cl_model_final = {
 /* End of the part specific to multiple linear regression perfmodels */
 /* ############################################ */
 
-static struct starpu_codelet cl_init = {
+static struct starpu_codelet cl_init =
+{
 	.cpu_funcs = { cpu_func },
 	.cpu_funcs_name = { "cpu_func" },
 	.nbuffers = 0,
 	.model = &cl_model_init,
 };
 
-static struct starpu_codelet cl_final = {
+static struct starpu_codelet cl_final =
+{
 	.cpu_funcs = { cpu_func },
 	.cpu_funcs_name = { "cpu_func" },
 	.nbuffers = 0,
@@ -140,7 +144,7 @@ int main(int argc, char **argv)
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 		return 77;
-	
+
 	sum=0;
 	int m,n,k;
 
@@ -150,7 +154,7 @@ int main(int argc, char **argv)
 		m = (int) ((rand() % 10)+1);
 		n = (int) ((rand() % 10)+1);
 		k = (int) ((rand() % 10)+1);
-		
+
 		for(j=0; j < 42; j++)
 		{
 			starpu_insert_task(&cl_init,
@@ -165,7 +169,7 @@ int main(int argc, char **argv)
 				   0);
 		}
 	}
-			  
+
 	starpu_shutdown();
 
 	return 0;

+ 1 - 0
examples/native_fortran/nf_sched_ctx.f90

@@ -96,6 +96,7 @@ program nf_sched_ctx
                 procs2(i) = procs(nprocs1+i)
                 write(*,*) i, procs2(i)
         end do
+        deallocate(procs)
 
         ! create sched context 1 with default policy, by giving a NULL policy name
         ctx1 = fstarpu_sched_ctx_create(procs1, nprocs1,  &

+ 5 - 3
examples/ppm_downscaler/yuv_downscaler.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013-2015  Université de Bordeaux
+ * Copyright (C) 2010-2011, 2013-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  *
@@ -43,8 +43,8 @@ void parse_args(int argc, char **argv)
 {
 	if (argc == 3)
 	{
-		strncpy(filename_in, argv[1], 1024);
-		strncpy(filename_out, argv[2], 1024);
+		strncpy(filename_in, argv[1], 1023);
+		strncpy(filename_out, argv[2], 1023);
 	}
 	else
 	{
@@ -218,6 +218,7 @@ int main(int argc, char **argv)
 	/* do the computation */
 	for (frame = 0; frame < nframes; frame++)
 	{
+		starpu_iteration_push(frame);
 		unsigned blocky;
 		for (blocky = 0; blocky < nblocks_y; blocky++)
 		{
@@ -265,6 +266,7 @@ int main(int argc, char **argv)
 			ret = starpu_task_submit(task);
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
+		starpu_iteration_pop();
 	}
 
 	/* make sure all output buffers are sync'ed */

+ 3 - 2
examples/sched_ctx/axpy_partition_gpu.h

@@ -22,7 +22,8 @@
 #pragma once
 
 
-__device__ static uint get_smid(void) {
+__device__ static uint get_smid(void)
+{
 #if defined(__CUDACC__)
   uint ret;
   asm("mov.u32 %0, %smid;" : "=r"(ret) );
@@ -50,7 +51,7 @@ __syncthreads();							\
 if(block_start > active_blocks)						\
   {									\
     return;								\
-  }									
+  }
 
 #define __P_LOOPXY							\
   dim3 blockid;								\

+ 1 - 0
examples/sched_ctx/parallel_code.c

@@ -79,6 +79,7 @@ int main(int argc, char **argv)
 #else
 	nprocs1 = 1;
 	procs1 = (int*)malloc(nprocs1*sizeof(int));
+	procs1[0] = 0;
 #endif
 
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);

+ 5 - 3
examples/scheduler/heteroprio_test.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2015  INRIA
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -173,11 +173,13 @@ int main(int argc, char** argv)
 	memset(handles, 0, sizeof(handles[0])*nbHandles);
 	int dataA[nbHandles];
 	int idx;
-	for(idx = 0; idx < nbHandles; ++idx){
+	for(idx = 0; idx < nbHandles; ++idx)
+	{
 		dataA[idx] = idx;
 	}
 	int idxHandle;
-	for(idxHandle = 0; idxHandle < nbHandles; ++idxHandle){
+	for(idxHandle = 0; idxHandle < nbHandles; ++idxHandle)
+	{
 		starpu_variable_data_register(&handles[idxHandle], 0, (uintptr_t)&dataA[idxHandle], sizeof(dataA[idxHandle]));
 	}
 

+ 4 - 2
examples/spmv/dw_block_spmv_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011, 2015  Université de Bordeaux
- * Copyright (C) 2010, 2011  CNRS
+ * Copyright (C) 2009, 2010-2011, 2015, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -49,11 +49,13 @@ static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUS
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+		{
 			cublasStatus_t status = cublasSgemv (starpu_cublas_get_local_handle(),
 					CUBLAS_OP_T, dx, dy, &p1, block, ld, in, 1, &p1, out, 1);
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 			break;
+		}
 #endif
 		default:
 			STARPU_ABORT();

+ 2 - 2
examples/stencil/Makefile.am

@@ -90,10 +90,10 @@ MPI = $(MPIEXEC) $(MPIEXEC_ARGS) -np 4
 endif
 
 if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+TESTS_ENVIRONMENT	=	$(MPI_RUN_ARGS) top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
 LOG_COMPILER		=	$(MPI) $(LOADER_BIN)
 else
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
+TESTS_ENVIRONMENT	=	$(MPI_RUN_ARGS) top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 
 endif

+ 10 - 8
examples/stencil/implicit-stencil-blocks.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2013-2016  Université de Bordeaux
+ * Copyright (C) 2010, 2013-2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -247,7 +247,7 @@ void assign_blocks_to_mpi_nodes(int world_size)
 
 static size_t allocated = 0;
 
-static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr, unsigned nx, unsigned ny, unsigned nz)
+static void allocate_block_on_node(starpu_data_handle_t *handleptr, unsigned bz, TYPE **ptr, unsigned nx, unsigned ny, unsigned nz)
 {
 	int ret;
 	size_t block_size = nx*ny*nz*sizeof(TYPE);
@@ -270,6 +270,8 @@ static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr,
 
 	/* Register it to StarPU */
 	starpu_block_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
+
+	starpu_data_set_coordinates(*handleptr, 1, bz);
 }
 
 static void free_block_on_node(starpu_data_handle_t handleptr, unsigned nx, unsigned ny, unsigned nz)
@@ -299,21 +301,21 @@ void allocate_memory_on_node(int rank)
 		if (node == rank)
 		{
 			/* Main blocks */
-			allocate_block_on_node(&block->layers_handle[0], &block->layers[0],
+			allocate_block_on_node(&block->layers_handle[0], bz, &block->layers[0],
 					       (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
-			allocate_block_on_node(&block->layers_handle[1], &block->layers[1],
+			allocate_block_on_node(&block->layers_handle[1], bz, &block->layers[1],
 					       (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
 
 			/* Boundary blocks : Top */
-			allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
+			allocate_block_on_node(&block->boundaries_handle[T][0], bz, &block->boundaries[T][0],
 					       (sizex + 2*K), (sizey + 2*K), K);
-			allocate_block_on_node(&block->boundaries_handle[T][1], &block->boundaries[T][1],
+			allocate_block_on_node(&block->boundaries_handle[T][1], bz, &block->boundaries[T][1],
 					       (sizex + 2*K), (sizey + 2*K), K);
 
 			/* Boundary blocks : Bottom */
-			allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
+			allocate_block_on_node(&block->boundaries_handle[B][0], bz, &block->boundaries[B][0],
 					       (sizex + 2*K), (sizey + 2*K), K);
-			allocate_block_on_node(&block->boundaries_handle[B][1], &block->boundaries[B][1],
+			allocate_block_on_node(&block->boundaries_handle[B][1], bz, &block->boundaries[B][1],
 					       (sizex + 2*K), (sizey + 2*K), K);
 		}
 		/* Register void blocks to StarPU, that StarPU-MPI will request to

+ 10 - 8
examples/stencil/stencil-blocks.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2013-2016  Université de Bordeaux
+ * Copyright (C) 2010, 2013-2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -248,7 +248,7 @@ void assign_blocks_to_mpi_nodes(int world_size)
 
 static size_t allocated = 0;
 
-static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr, unsigned nx, unsigned ny, unsigned nz)
+static void allocate_block_on_node(starpu_data_handle_t *handleptr, unsigned bz, TYPE **ptr, unsigned nx, unsigned ny, unsigned nz)
 {
 	int ret;
 	size_t block_size = nx*ny*nz*sizeof(TYPE);
@@ -271,6 +271,8 @@ static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr,
 
 	/* Register it to StarPU */
 	starpu_block_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
+
+	starpu_data_set_coordinates(*handleptr, 1, bz);
 }
 
 static void free_block_on_node(starpu_data_handle_t handleptr, unsigned nx, unsigned ny, unsigned nz)
@@ -300,7 +302,7 @@ void allocate_memory_on_node(int rank)
 		/* Main blocks */
 		if (node == rank)
 		{
-			allocate_block_on_node(&block->layers_handle[0], &block->layers[0],
+			allocate_block_on_node(&block->layers_handle[0], bz, &block->layers[0],
 						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
 #ifndef STARPU_SIMGRID
 #ifdef LIFE
@@ -314,7 +316,7 @@ void allocate_memory_on_node(int rank)
 /*			printf("block %d starts with %d/%d alive\n", bz, sum, sizex*sizey*size_bz);*/
 #endif
 #endif
-			allocate_block_on_node(&block->layers_handle[1], &block->layers[1],
+			allocate_block_on_node(&block->layers_handle[1], bz, &block->layers[1],
 						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
 		}
 
@@ -322,9 +324,9 @@ void allocate_memory_on_node(int rank)
 		int top_node = block->boundary_blocks[T]->mpi_node;
 		if ((node == rank) || (top_node == rank))
 		{
-			allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
+			allocate_block_on_node(&block->boundaries_handle[T][0], bz, &block->boundaries[T][0],
 						(sizex + 2*K), (sizey + 2*K), K);
-			allocate_block_on_node(&block->boundaries_handle[T][1], &block->boundaries[T][1],
+			allocate_block_on_node(&block->boundaries_handle[T][1], bz, &block->boundaries[T][1],
 						(sizex + 2*K), (sizey + 2*K), K);
 		}
 
@@ -332,9 +334,9 @@ void allocate_memory_on_node(int rank)
 		int bottom_node = block->boundary_blocks[B]->mpi_node;
 		if ((node == rank) || (bottom_node == rank))
 		{
-			allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
+			allocate_block_on_node(&block->boundaries_handle[B][0], bz, &block->boundaries[B][0],
 						(sizex + 2*K), (sizey + 2*K), K);
-			allocate_block_on_node(&block->boundaries_handle[B][1], &block->boundaries[B][1],
+			allocate_block_on_node(&block->boundaries_handle[B][1], bz, &block->boundaries[B][1],
 						(sizex + 2*K), (sizey + 2*K), K);
 		}
 	}

+ 5 - 2
examples/stencil/stencil-tasks.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2013-2015  Université de Bordeaux
+ * Copyright (C) 2010, 2013-2015, 2017  Université de Bordeaux
  * Copyright (C) 2012, 2013, 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -151,7 +151,8 @@ void create_task_save(unsigned iter, unsigned z, int dir, int local_rank)
 
 	}
 	else
-	{	/* node_z != local_rank, this MPI node doesn't have the saved data */
+	{
+		/* node_z != local_rank, this MPI node doesn't have the saved data */
 		if (node_z_and_d == local_rank)
 		{
 			create_task_save_mpi_recv(iter, z, dir, local_rank);
@@ -296,6 +297,7 @@ void create_tasks(int rank)
 
 	for (iter = 0; iter <= niter; iter++)
 	{
+	     starpu_iteration_push(iter);
 	     for (bz = 0; bz < nbz; bz++)
 	     {
 		  if ((iter > 0) && (get_block_mpi_node(bz) == rank))
@@ -313,6 +315,7 @@ void create_tasks(int rank)
 				     create_task_save(iter, bz, -1, rank);
 		     }
 	     }
+	     starpu_iteration_pop();
 	}
 }
 

+ 3 - 1
examples/stencil/stencil.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
- * Copyright (C) 2010-2012, 2014  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2014, 2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -330,6 +330,7 @@ int main(int argc, char **argv)
 		int iter;
 		for (iter = 0; iter < who_runs_what_len; iter++)
 		{
+			starpu_iteration_push(iter);
 			unsigned last, bz;
 			last = 1;
 			for (bz = 0; bz < nbz; bz++)
@@ -350,6 +351,7 @@ int main(int argc, char **argv)
 			}
 			FPRINTF(stderr, "\n");
 
+			starpu_iteration_pop();
 			if (last)
 				break;
 		}

+ 2 - 1
gcc-plugin/src/starpu.c

@@ -72,7 +72,8 @@
 
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 /* Declared with `C' linkage in <gcc-plugin.h>.  */

+ 3 - 2
include/pthread_win32/pthread.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010 Université Bordeaux
- * Copyright (C) 2010  CNRS
+ * Copyright (C) 2010, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -60,7 +60,8 @@
 #endif
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif /* __cplusplus */
 
 /***********

+ 9 - 1
include/starpu_data.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
  * Copyright (C) 2016  Inria
  *
@@ -53,6 +53,10 @@ struct starpu_data_descr
 
 struct starpu_data_interface_ops;
 
+void starpu_data_set_name(starpu_data_handle_t handle, const char *name);
+void starpu_data_set_coordinates_array(starpu_data_handle_t handle, int dimensions, int dims[]);
+void starpu_data_set_coordinates(starpu_data_handle_t handle, unsigned dimensions, ...);
+
 void starpu_data_unregister(starpu_data_handle_t handle);
 void starpu_data_unregister_no_coherency(starpu_data_handle_t handle);
 void starpu_data_unregister_submit(starpu_data_handle_t handle);
@@ -69,6 +73,10 @@ int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_data_access_
 int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);
 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency);
 int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency);
+int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency, long *pre_sync_jobid, long *post_sync_jobid);
+
+int starpu_data_acquire_try(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
+int starpu_data_acquire_on_node_try(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode);
 
 #ifdef __GCC__
 #  define STARPU_DATA_ACQUIRE_CB(handle, mode, code) do \

+ 4 - 1
include/starpu_fxt.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013, 2015-2016  Université de Bordeaux
+ * Copyright (C) 2010-2011, 2013, 2015-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2013, 2014  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -40,8 +40,10 @@ struct starpu_fxt_codelet_event
 struct starpu_fxt_options
 {
 	unsigned per_task_colour;
+	unsigned no_events;
 	unsigned no_counter;
 	unsigned no_bus;
+	unsigned no_flops;
 	unsigned ninputfiles;
 	unsigned no_smooth;
 	char *filenames[STARPU_FXT_MAX_FILES];
@@ -50,6 +52,7 @@ struct starpu_fxt_options
 	char *activity_path;
 	char *dag_path;
 	char *tasks_path;
+	char *data_path;
 	char *anim_path;
 	char *states_path;
 

+ 2 - 1
include/starpu_sched_component.h

@@ -80,7 +80,6 @@ struct starpu_sched_tree
 	struct starpu_sched_component *root;
 	struct starpu_bitmap *workers;
 	unsigned sched_ctx_id;
-	struct starpu_sched_component *worker_components[STARPU_NMAXWORKERS];
 	starpu_pthread_mutex_t lock;
 };
 
@@ -173,6 +172,8 @@ struct starpu_sched_component_perfmodel_select_data
 struct starpu_sched_component *starpu_sched_component_perfmodel_select_create(struct starpu_sched_tree *tree, struct starpu_sched_component_perfmodel_select_data *perfmodel_select_data) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_perfmodel_select(struct starpu_sched_component *component);
 
+void starpu_initialize_prio_center_policy(unsigned sched_ctx_id);
+
 struct starpu_sched_component_composed_recipe;
 struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create(void) STARPU_ATTRIBUTE_MALLOC;
 struct starpu_sched_component_composed_recipe *starpu_sched_component_composed_recipe_create_singleton(struct starpu_sched_component *(*create_component)(struct starpu_sched_tree *tree, void *arg), void *arg) STARPU_ATTRIBUTE_MALLOC;

+ 2 - 4
include/starpu_scheduler.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2016  Uppsala University
  *
@@ -57,15 +57,13 @@ struct starpu_sched_policy
 struct starpu_sched_policy **starpu_sched_get_predefined_policies();
 
 void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sched_mutex, starpu_pthread_cond_t **sched_cond);
+unsigned long starpu_task_get_job_id(struct starpu_task *task);
 
 /* This function must be called to wake up a worker that is sleeping on the cond. 
  * It returns 0 whenever the worker is not in a sleeping state */
 int starpu_wake_worker(int workerid);
-int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
 /* This is a version of starpu_wake_worker which assumes that the sched mutex is locked */
 int starpu_wake_worker_locked(int workerid);
-/* This is a version of starpu_wakeup_worker which assumes that the sched mutex is locked */
-int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
 
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 int starpu_worker_can_execute_task_impl(unsigned workerid, struct starpu_task *task, unsigned *impl_mask);

+ 10 - 4
include/starpu_task.h

@@ -138,14 +138,17 @@ struct starpu_task
 
 	int nbuffers;
 
-	starpu_data_handle_t handles[STARPU_NMAXBUFS];
-	void *interfaces[STARPU_NMAXBUFS];
-	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
-
+        /* We keep these before the static arrays, so we can detect dyn_handles
+	 * being NULL while nbuffers being bigger that STARPU_NMAXBUFS
+	 * (otherwise the overflow would put a non-NULL) */
 	starpu_data_handle_t *dyn_handles;
 	void **dyn_interfaces;
 	enum starpu_data_access_mode *dyn_modes;
 
+	starpu_data_handle_t handles[STARPU_NMAXBUFS];
+	void *interfaces[STARPU_NMAXBUFS];
+	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
+
 	void *cl_arg;
 	size_t cl_arg_size;
 
@@ -314,6 +317,9 @@ int starpu_task_wait_for_no_ready(void);
 int starpu_task_nready(void);
 int starpu_task_nsubmitted(void);
 
+void starpu_iteration_push(unsigned long iteration);
+void starpu_iteration_pop(void);
+
 void starpu_do_schedule(void);
 
 void starpu_codelet_init(struct starpu_codelet *cl);

+ 2 - 2
include/starpu_thread_util.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2014, 2016  Université de Bordeaux
+ * Copyright (C) 2010, 2012-2014, 2016-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -130,6 +130,7 @@ int _starpu_pthread_mutex_trylock_sched(starpu_pthread_mutex_t *mutex, char *fil
 }
 
 #define STARPU_PTHREAD_MUTEX_UNLOCK(mutex) do {                               \
+	_STARPU_CHECK_NOT_SCHED_MUTEX(mutex, __FILE__, __LINE__);              \
 	int p_ret = starpu_pthread_mutex_unlock(mutex);                        \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
@@ -137,7 +138,6 @@ int _starpu_pthread_mutex_trylock_sched(starpu_pthread_mutex_t *mutex, char *fil
 			__FILE__, __LINE__, strerror(p_ret));                  \
 		STARPU_ABORT();                                                \
 	}                                                                      \
-	_STARPU_CHECK_NOT_SCHED_MUTEX(mutex, __FILE__, __LINE__);                                  \
 } while (0)
 
 #define STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(mutex) do {                          \

+ 12 - 2
mpi/examples/comm/comm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015, 2016  CNRS
+ * Copyright (C) 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,8 +47,18 @@ int main(int argc, char **argv)
 	int rank, newrank;
 	int ret;
 	starpu_data_handle_t data[2];
+	int thread_support;
+
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
+	{
+		fprintf(stderr,"MPI_Init_thread failed\n");
+		exit(1);
+	}
+	if (thread_support == MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI does not have thread support!\n");
 
-        MPI_Init(&argc, &argv);
         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
         MPI_Comm_size(MPI_COMM_WORLD, &size);
 

+ 11 - 2
mpi/examples/comm/mix_comm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015  CNRS
+ * Copyright (C) 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -48,8 +48,17 @@ int main(int argc, char **argv)
 	int ret;
 	starpu_data_handle_t data[3];
 	int value = 90;
+	int thread_support;
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
+	{
+		fprintf(stderr,"MPI_Init_thread failed\n");
+		exit(1);
+	}
+	if (thread_support == MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI does not have thread support!\n");
 
-        MPI_Init(&argc, &argv);
         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
         MPI_Comm_size(MPI_COMM_WORLD, &size);
 

+ 12 - 10
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,6 +18,7 @@
 #include "mpi_cholesky.h"
 #include <common/blas.h>
 #include <sys/time.h>
+#include <limits.h>
 
 /*
  *	Create the codelets
@@ -73,6 +74,8 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_data_handle_t **data_handles;
 	unsigned x,y,i,j,k;
 
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
 	/* create all the DAG nodes */
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
@@ -89,7 +92,9 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 				starpu_matrix_data_register(&data_handles[x][y], STARPU_MAIN_RAM, (uintptr_t)matA[x][y],
 						ld, size/nblocks, size/nblocks, sizeof(float));
 			}
+#ifdef STARPU_DEVEL
 #warning TODO: make better test to only register what is needed
+#endif
 			else
 			{
 				/* I don't own that index, but will need it for my computations */
@@ -99,6 +104,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			}
 			if (data_handles[x][y])
 			{
+				starpu_data_set_coordinates(data_handles[x][y], 2, x, y);
 				starpu_mpi_data_register(data_handles[x][y], (y*nblocks)+x, mpi_rank);
 			}
 		}
@@ -109,20 +115,17 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
 	for (k = 0; k < nblocks; k++)
 	{
-		int prio = STARPU_DEFAULT_PRIO;
-		if (!noprio) prio = STARPU_MAX_PRIO;
+		starpu_iteration_push(k);
 
 		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
-				       STARPU_PRIORITY, prio,
+				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 				       STARPU_RW, data_handles[k][k],
 				       0);
 
 		for (j = k+1; j<nblocks; j++)
 		{
-			prio = STARPU_DEFAULT_PRIO;
-			if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
 			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
-					       STARPU_PRIORITY, prio,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j) : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 					       STARPU_R, data_handles[k][k],
 					       STARPU_RW, data_handles[k][j],
 					       0);
@@ -135,10 +138,8 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			{
 				if (i <= j)
 				{
-					prio = STARPU_DEFAULT_PRIO;
-					if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, prio,
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j - i) : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 							       STARPU_R, data_handles[k][i],
 							       STARPU_R, data_handles[k][j],
 							       STARPU_RW | STARPU_COMMUTE, data_handles[i][j],
@@ -150,6 +151,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			if (my_distrib(k, j, nodes) == rank)
 				starpu_data_wont_use(data_handles[k][j]);
 		}
+		starpu_iteration_pop();
 	}
 
 	starpu_task_wait_for_all();

+ 41 - 19
mpi/examples/matrix_mult/mm.c

@@ -93,7 +93,8 @@ static void disp_matrix(double *m)
 }
 #endif
 
-static void check_result(void) {
+static void check_result(void)
+{
 	int row,col;
 	for (row = 0; row < N; row++)
 	{
@@ -131,7 +132,8 @@ static void register_matrices()
 
 	int b_row,b_col;
 
-	for (b_row = 0; b_row < NB; b_row++) {
+	for (b_row = 0; b_row < NB; b_row++)
+	{
 		/* Register a block to StarPU */
 		starpu_matrix_data_register(&A_h[b_row],
 				mr,
@@ -144,23 +146,29 @@ static void register_matrices()
 		 * Note: StarPU-MPI is an autonomous layer built on top of StarPU, hence the two separate
 		 * registration steps.
 		 */
+		starpu_data_set_coordinates(A_h[b_row], 2, 0, b_row);
 		starpu_mpi_data_register(A_h[b_row], tag++, 0);
 	}
 
-	for (b_col = 0; b_col < NB; b_col++) {
+	for (b_col = 0; b_col < NB; b_col++)
+	{
 		starpu_matrix_data_register(&B_h[b_col],
 				mr,
 				(comm_rank == 0)?(uintptr_t)(B+b_col*BS):0, N, BS, N,
 				sizeof(double));
+		starpu_data_set_coordinates(B_h[b_col], 2, b_col, 0);
 		starpu_mpi_data_register(B_h[b_col], tag++, 0);
 	}
 
-	for (b_row = 0; b_row < NB; b_row++) {
-		for (b_col = 0; b_col < NB; b_col++) {
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		for (b_col = 0; b_col < NB; b_col++)
+		{
 			starpu_matrix_data_register(&C_h[b_row*NB+b_col],
 					mr,
 					(comm_rank == 0)?(uintptr_t)(C+b_row*BS*N+b_col*BS):0, N, BS, BS,
 					sizeof(double));
+			starpu_data_set_coordinates(C_h[b_row*NB+b_col], 2, b_col, b_row);
 			starpu_mpi_data_register(C_h[b_row*NB+b_col], tag++, 0);
 		}
 	}
@@ -191,8 +199,10 @@ static void distribute_matrix_C(void)
 static void undistribute_matrix_C(void)
 {
 	int b_row,b_col;
-	for (b_row = 0; b_row < NB; b_row++) {
-		for (b_col = 0; b_col < NB; b_col++) {
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		for (b_col = 0; b_col < NB; b_col++)
+		{
 			starpu_data_handle_t h = C_h[b_row*NB+b_col]; 
 			starpu_mpi_data_migrate(MPI_COMM_WORLD, h, 0);
 		}
@@ -204,16 +214,20 @@ static void unregister_matrices()
 {
 	int b_row,b_col;
 
-	for (b_row = 0; b_row < NB; b_row++) {
+	for (b_row = 0; b_row < NB; b_row++)
+	{
 		starpu_data_unregister(A_h[b_row]);
 	}
 
-	for (b_col = 0; b_col < NB; b_col++) {
+	for (b_col = 0; b_col < NB; b_col++)
+	{
 		starpu_data_unregister(B_h[b_col]);
 	}
 
-	for (b_row = 0; b_row < NB; b_row++) {
-		for (b_col = 0; b_col < NB; b_col++) {
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		for (b_col = 0; b_col < NB; b_col++)
+		{
 			starpu_data_unregister(C_h[b_row*NB+b_col]);
 		}
 	}
@@ -249,9 +263,12 @@ static void cpu_mult(void *handles[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	assert(n_col_A == n_row_B);
 
 	unsigned i,j,k;
-	for (k = 0; k < n_row_C; k++) {
-		for (j = 0; j < n_col_C; j++) {
-			for (i = 0; i < n_col_A; i++) {
+	for (k = 0; k < n_row_C; k++)
+	{
+		for (j = 0; j < n_col_C; j++)
+		{
+			for (i = 0; i < n_col_A; i++)
+			{
 				block_C[k*ld_C+j] += block_A[k*ld_A+i] * block_B[i*ld_B+j]; 
 			}
 
@@ -293,16 +310,20 @@ int main(int argc, char *argv[])
 	}
 
 	/* Parse the matrix size and block size optional args */
-	if (argc > 1) {
+	if (argc > 1)
+	{
 		N = atoi(argv[1]);
-		if (N < 1) {
+		if (N < 1)
+		{
 			fprintf(stderr, "invalid matrix size\n");
 			exit(1);
 		}
-		if (argc > 2) {
+		if (argc > 2)
+		{
 			BS = atoi(argv[2]);
 		}
-		if (BS < 1 || N % BS != 0) {
+		if (BS < 1 || N % BS != 0)
+		{
 			fprintf(stderr, "invalid block size\n");
 			exit(1);
 		}
@@ -353,7 +374,8 @@ int main(int argc, char *argv[])
 	undistribute_matrix_C();
 	unregister_matrices();
 
-	if (comm_rank == 0) {
+	if (comm_rank == 0)
+	{
 #if VERBOSE
 		disp_matrix(C);
 #endif

+ 29 - 15
mpi/examples/mpi_lu/plu_example.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013, 2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010-2011, 2013, 2015, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -70,26 +70,32 @@ static TYPE **(tmp_21_block[2]);
 static void parse_args(int rank, int argc, char **argv)
 {
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-size") == 0)
+		{
 			char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-nblocks") == 0) {
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
 			char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-check") == 0) {
+		if (strcmp(argv[i], "-check") == 0)
+		{
 			check = 1;
 		}
 
-		if (strcmp(argv[i], "-display") == 0) {
+		if (strcmp(argv[i], "-display") == 0)
+		{
 			display = 1;
 		}
 
-		if (strcmp(argv[i], "-numa") == 0) {
+		if (strcmp(argv[i], "-numa") == 0)
+		{
 #ifdef STARPU_HAVE_LIBNUMA
 			numa = 1;
 #else
@@ -98,17 +104,20 @@ static void parse_args(int rank, int argc, char **argv)
 #endif
 		}
 
-		if (strcmp(argv[i], "-p") == 0) {
+		if (strcmp(argv[i], "-p") == 0)
+		{
 			char *argptr;
 			p = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-q") == 0) {
+		if (strcmp(argv[i], "-q") == 0)
+		{
 			char *argptr;
 			q = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0)
+		{
 			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
 			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
 			exit(0);
@@ -249,8 +258,10 @@ static void init_matrix(int rank)
 				starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM,
 					(uintptr_t)*blockptr, size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
+				starpu_data_set_coordinates(*handleptr, 2, j, i);
 			}
-			else {
+			else
+			{
 				*blockptr = STARPU_POISON_PTR;
 				*handleptr = STARPU_POISON_PTR;
 			}
@@ -296,7 +307,8 @@ static void init_matrix(int rank)
 
 	allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
 #else
-	for (i = 0; i < 2; i++) {
+	for (i = 0; i < 2; i++)
+	{
 		tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
 		tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
 		tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *));
@@ -331,7 +343,8 @@ static void init_matrix(int rank)
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 		}
 #else
-	for (i = 0; i < 2; i++) {
+	for (i = 0; i < 2; i++)
+	{
 		if (tmp_12_block_is_needed(rank, nblocks, k))
 		{
 			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
@@ -410,7 +423,8 @@ int main(int argc, char **argv)
 	 *	Initialization
 	 */
 	int thread_support;
-	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
+	{
 		fprintf(stderr,"MPI_Init_thread failed\n");
 		exit(1);
 	}

+ 23 - 12
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010-2011, 2013, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -51,26 +51,32 @@ int get_block_rank(unsigned i, unsigned j);
 static void parse_args(int argc, char **argv)
 {
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-size") == 0)
+		{
 			char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-nblocks") == 0) {
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
 			char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-check") == 0) {
+		if (strcmp(argv[i], "-check") == 0)
+		{
 			check = 1;
 		}
 
-		if (strcmp(argv[i], "-display") == 0) {
+		if (strcmp(argv[i], "-display") == 0)
+		{
 			display = 1;
 		}
 
-		if (strcmp(argv[i], "-numa") == 0) {
+		if (strcmp(argv[i], "-numa") == 0)
+		{
 #ifdef STARPU_HAVE_LIBNUMA
 			numa = 1;
 #else
@@ -78,17 +84,20 @@ static void parse_args(int argc, char **argv)
 #endif
 		}
 
-		if (strcmp(argv[i], "-p") == 0) {
+		if (strcmp(argv[i], "-p") == 0)
+		{
 			char *argptr;
 			p = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-q") == 0) {
+		if (strcmp(argv[i], "-q") == 0)
+		{
 			char *argptr;
 			q = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0)
+		{
 			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
 			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
 			exit(0);
@@ -168,12 +177,14 @@ static void init_matrix(int rank)
 					(uintptr_t)*blockptr, size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
 			}
-			else {
+			else
+			{
 				starpu_matrix_data_register(handleptr, -1,
 					0, size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
 				*blockptr = STARPU_POISON_PTR;
 			}
+			starpu_data_set_coordinates(*handleptr, 2, j, i);
 			starpu_mpi_data_register(*handleptr, j+i*nblocks, block_rank);
 		}
 	}

+ 35 - 17
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010-2011, 2013-2014, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -53,26 +53,32 @@ int get_block_rank(unsigned i, unsigned j);
 static void parse_args(int argc, char **argv)
 {
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-size") == 0)
+		{
 			char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-nblocks") == 0) {
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
 			char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-check") == 0) {
+		if (strcmp(argv[i], "-check") == 0)
+		{
 			check = 1;
 		}
 
-		if (strcmp(argv[i], "-display") == 0) {
+		if (strcmp(argv[i], "-display") == 0)
+		{
 			display = 1;
 		}
 
-		if (strcmp(argv[i], "-numa") == 0) {
+		if (strcmp(argv[i], "-numa") == 0)
+		{
 #ifdef STARPU_HAVE_LIBNUMA
 			numa = 1;
 #else
@@ -80,20 +86,25 @@ static void parse_args(int argc, char **argv)
 #endif
 		}
 
-		if (strcmp(argv[i], "-p") == 0) {
+		if (strcmp(argv[i], "-p") == 0)
+		{
 			char *argptr;
 			p = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-q") == 0) {
+		if (strcmp(argv[i], "-q") == 0)
+		{
 			char *argptr;
 			q = strtol(argv[++i], &argptr, 10);
 		}
 
 		if (strcmp(argv[i], "-path") == 0)
+		{
 			path = argv[++i];
+		}
 
-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0)
+		{
 			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q] [-path PATH]\n", argv[0]);
 			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
 			exit(0);
@@ -147,15 +158,18 @@ static void create_matrix()
 			}
 			snprintf(filename, filename_length, "%s/%u,%u", path, i, j);
 			fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0777);
-			if (fd < 0) {
+			if (fd < 0)
+			{
 				perror("open");
 				exit(1);
 			}
-			if (write(fd, blockptr, blocksize) != (starpu_ssize_t) blocksize) {
+			if (write(fd, blockptr, blocksize) != (starpu_ssize_t) blocksize)
+			{
 				fprintf(stderr,"short write");
 				exit(1);
 			}
-			if (close(fd) < 0) {
+			if (close(fd) < 0)
+			{
 				perror("close");
 				exit(1);
 			}
@@ -193,7 +207,8 @@ static void init_matrix(int rank)
 				snprintf(filename, sizeof(filename), "%u,%u", i, j);
 				/* Register it to StarPU */
 				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
-				if (!disk_obj) {
+				if (!disk_obj)
+				{
 					fprintf(stderr,"could not open %s\n", filename);
 					exit(1);
 				}
@@ -201,11 +216,13 @@ static void init_matrix(int rank)
 					(uintptr_t) disk_obj, size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
 			}
-			else {
+			else
+			{
 				starpu_matrix_data_register(handleptr, -1,
 					0, size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
 			}
+			starpu_data_set_coordinates(*handleptr, 2, j, i);
 			starpu_mpi_data_register(*handleptr, j+i*nblocks, block_rank);
 		}
 	}
@@ -243,7 +260,8 @@ int main(int argc, char **argv)
 	parse_args(argc, argv);
 
 	ret = mkdir(path, 0777);
-	if (ret != 0 && errno != EEXIST) {
+	if (ret != 0 && errno != EEXIST)
+	{
 		fprintf(stderr,"%s does not exist\n", path);
 		exit(1);
 	}

+ 5 - 3
mpi/examples/mpi_lu/plu_solve.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2014  Université de Bordeaux
- * Copyright (C) 2010, 2016  CNRS
+ * Copyright (C) 2010, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -247,7 +247,8 @@ TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 		{
 			block = STARPU_PLU(get_block)(bi, bj);
 		}
-		else {
+		else
+		{
 			MPI_Status status;
 
 			if (rank == 0)
@@ -257,7 +258,8 @@ TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, block_rank, 0, MPI_COMM_WORLD, &status);
 				STARPU_ASSERT(ret == MPI_SUCCESS);
 			}
-			else if (rank == block_rank) {
+			else if (rank == block_rank)
+			{
 				block = STARPU_PLU(get_block)(bi, bj);
 				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
 				STARPU_ASSERT(ret == MPI_SUCCESS);

+ 95 - 64
mpi/examples/mpi_lu/pxlu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2014  Université de Bordeaux
- * Copyright (C) 2010, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2014, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2013, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -58,7 +58,8 @@ static unsigned nblocks = 0;
 static int rank = -1;
 static int world_size = -1;
 
-struct callback_arg {
+struct callback_arg
+{
 	unsigned i, j, k;
 };
 
@@ -104,7 +105,8 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 	int r;
 	for (r = 0; r < world_size; r++)
 	{
-		if (rank_mask[r]) {
+		if (rank_mask[r])
+		{
 			rank_array[cnt] = r;
 
 			comm_array[cnt] = MPI_COMM_WORLD;
@@ -120,7 +122,8 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 		 * once */
 		starpu_tag_notify_from_apps(tag);
 	}
-	else {
+	else
+	{
 		starpu_mpi_isend_array_detached_unlock_tag(cnt, handle_array,
 				rank_array, mpi_tag_array, comm_array, tag);
 	}
@@ -129,7 +132,8 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 /* Initiate a receive request once all dependencies are fulfilled and unlock
  * tag 'unlocked_tag' once it's done. */
 
-struct recv_when_done_callback_arg {
+struct recv_when_done_callback_arg
+{
 	int source;
 	int mpi_tag;
 	starpu_data_handle_t handle;
@@ -156,7 +160,7 @@ static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
 
 	struct recv_when_done_callback_arg *arg =
 		malloc(sizeof(struct recv_when_done_callback_arg));
-	
+
 	arg->source = source;
 	arg->mpi_tag = mpi_tag;
 	arg->handle = handle;
@@ -186,7 +190,7 @@ static void create_task_11_recv(unsigned k)
 	 * 21(k-1)i with i,j >= k */
 	unsigned ndeps = 0;
 	starpu_tag_t tag_array[2*nblocks];
-	
+
 #ifdef SINGLE_TMP11
 	unsigned i, j;
 	if (k > 0)
@@ -203,7 +207,7 @@ static void create_task_11_recv(unsigned k)
 			tag_array[ndeps++] = TAG12(k-1, j);
 	}
 #endif
-	
+
 	int source = get_block_rank(k, k);
 #ifdef SINGLE_TMP11
 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)();
@@ -254,7 +258,7 @@ static void callback_task_11_real(void *_arg)
 	starpu_tag_t tag = TAG11_SAVE(k);
 	int mpi_tag = MPI_TAG11(k);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
-	
+
 	free(arg);
 }
 
@@ -280,10 +284,12 @@ static void create_task_11_real(unsigned k)
 		task->priority = STARPU_MAX_PRIO;
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG11(k), 1, STARPU_TAG_INIT);
 	}
 
@@ -296,25 +302,27 @@ static void create_task_11(unsigned k)
 	if (get_block_rank(k, k) == rank)
 	{
 #ifdef VERBOSE_INIT
-		fprintf(stderr, "CREATE real task 11(%d) (TAG11_SAVE(%d) = %lx) on node %d\n", k, k, TAG11_SAVE(k), rank);
+		fprintf(stderr, "CREATE real task 11(%u) (TAG11_SAVE(%u) = %lx) on node %d\n", k, k, TAG11_SAVE(k), rank);
 #endif
 		create_task_11_real(k);
 	}
-	else {
+	else
+	{
 		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
 		int rank_mask[world_size];
 		find_nodes_using_11(k, rank_mask);
-		
+
 		if (rank_mask[rank])
 		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "create RECV task 11(%d) on node %d\n", k, rank);
+			fprintf(stderr, "create RECV task 11(%u) on node %d\n", k, rank);
 #endif
 			create_task_11_recv(k);
 		}
-		else {
+		else
+		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "Node %d needs not 11(%d)\n", rank, k);
+			fprintf(stderr, "Node %d needs not 11(%u)\n", rank, k);
 #endif
 		}
 	}
@@ -338,7 +346,7 @@ static void create_task_12_recv(unsigned k, unsigned j)
 	 * i >= k */
 	unsigned ndeps = 0;
 	starpu_tag_t tag_array[nblocks];
-	
+
 #ifdef SINGLE_TMP1221
 	if (k > 0)
 	for (i = (k-1)+1; i < nblocks; i++)
@@ -354,7 +362,7 @@ static void create_task_12_recv(unsigned k, unsigned j)
 			tag_array[ndeps++] = TAG22(k-2, i, j);
 #endif
 	}
-	
+
 	int source = get_block_rank(k, j);
 #ifdef SINGLE_TMP1221
 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
@@ -398,15 +406,17 @@ static void callback_task_12_real(void *_arg)
 	starpu_tag_t tag = TAG12_SAVE(k, j);
 	int mpi_tag = MPI_TAG12(k, j);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
-	
+
 	free(arg);
 }
 
 static void create_task_12_real(unsigned k, unsigned j)
 {
 	struct starpu_task *task = create_task(TAG12(k, j));
-	
+
+#ifdef STARPU_DEVEL
 #warning temporary fix :/
+#endif
 //	task->cl = &STARPU_PLU(cl12);
 	task->cl = &STARPU_PLU(cl21);
 
@@ -414,7 +424,7 @@ static void create_task_12_real(unsigned k, unsigned j)
 
 	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);
 
-	starpu_tag_t tag_11_dep; 
+	starpu_tag_t tag_11_dep;
 
 	/* which sub-data is manipulated ? */
 	starpu_data_handle_t diag_block;
@@ -423,7 +433,7 @@ static void create_task_12_real(unsigned k, unsigned j)
 		diag_block = STARPU_PLU(get_block_handle)(k, k);
 		tag_11_dep = TAG11(k);
 	}
-	else 
+	else
 	{
 #ifdef SINGLE_TMP11
 		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
@@ -433,8 +443,8 @@ static void create_task_12_real(unsigned k, unsigned j)
 		tag_11_dep = TAG11_SAVE(k);
 	}
 
-	task->handles[0] = diag_block; 
-	task->handles[1] = STARPU_PLU(get_block_handle)(k, j); 
+	task->handles[0] = diag_block;
+	task->handles[1] = STARPU_PLU(get_block_handle)(k, j);
 
 	STARPU_ASSERT(get_block_rank(k, j) == rank);
 
@@ -448,15 +458,18 @@ static void create_task_12_real(unsigned k, unsigned j)
 	task->callback_func = callback_task_12_real;
 	task->callback_arg = arg;
 
-	if (!no_prio && (j == k+1)) {
+	if (!no_prio && (j == k+1))
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG12(k, j), 2, tag_11_dep, TAG22(k-1, k, j));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG12(k, j), 1, tag_11_dep);
 	}
 
@@ -469,25 +482,27 @@ static void create_task_12(unsigned k, unsigned j)
 	if (get_block_rank(k, j) == rank)
 	{
 #ifdef VERBOSE_INIT
-		fprintf(stderr, "CREATE real task 12(k = %d, j = %d) on node %d\n", k, j, rank);
+		fprintf(stderr, "CREATE real task 12(k = %u, j = %u) on node %d\n", k, j, rank);
 #endif
 		create_task_12_real(k, j);
 	}
-	else {
+	else
+	{
 		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
 		int rank_mask[world_size];
 		find_nodes_using_12(k, j, rank_mask);
-		
+
 		if (rank_mask[rank])
 		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "create RECV task 12(k = %d, j = %d) on node %d\n", k, j, rank);
+			fprintf(stderr, "create RECV task 12(k = %u, j = %u) on node %d\n", k, j, rank);
 #endif
 			create_task_12_recv(k, j);
 		}
-		else {
+		else
+		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "Node %d needs not 12(k=%d, i=%d)\n", rank, k, j);
+			fprintf(stderr, "Node %d needs not 12(k=%u, i=%u)\n", rank, k, j);
 #endif
 		}
 	}
@@ -509,7 +524,7 @@ static void create_task_21_recv(unsigned k, unsigned i)
 	 * j >= k */
 	unsigned ndeps = 0;
 	starpu_tag_t tag_array[nblocks];
-	
+
 #ifdef SINGLE_TMP1221
 	if (k > 0)
 	for (j = (k-1)+1; j < nblocks; j++)
@@ -570,7 +585,7 @@ static void callback_task_21_real(void *_arg)
 	starpu_tag_t tag = TAG21_SAVE(k, i);
 	int mpi_tag = MPI_TAG21(k, i);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
-	
+
 	free(arg);
 }
 
@@ -578,7 +593,9 @@ static void create_task_21_real(unsigned k, unsigned i)
 {
 	struct starpu_task *task = create_task(TAG21(k, i));
 
-#warning temporary fix 
+#ifdef STARPU_DEVEL
+#warning temporary fix
+#endif
 //	task->cl = &STARPU_PLU(cl21);
 	task->cl = &STARPU_PLU(cl12);
 
@@ -586,8 +603,8 @@ static void create_task_21_real(unsigned k, unsigned i)
 
 	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);
 
-	starpu_tag_t tag_11_dep; 
-	
+	starpu_tag_t tag_11_dep;
+
 	/* which sub-data is manipulated ? */
 	starpu_data_handle_t diag_block;
 	if (diag_block_is_local)
@@ -595,7 +612,7 @@ static void create_task_21_real(unsigned k, unsigned i)
 		diag_block = STARPU_PLU(get_block_handle)(k, k);
 		tag_11_dep = TAG11(k);
 	}
-	else 
+	else
 	{
 #ifdef SINGLE_TMP11
 		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
@@ -605,7 +622,7 @@ static void create_task_21_real(unsigned k, unsigned i)
 		tag_11_dep = TAG11_SAVE(k);
 	}
 
-	task->handles[0] = diag_block; 
+	task->handles[0] = diag_block;
 	task->handles[1] = STARPU_PLU(get_block_handle)(i, k);
 
 	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
@@ -618,15 +635,18 @@ static void create_task_21_real(unsigned k, unsigned i)
 	task->callback_func = callback_task_21_real;
 	task->callback_arg = arg;
 
-	if (!no_prio && (i == k+1)) {
+	if (!no_prio && (i == k+1))
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG21(k, i), 2, tag_11_dep, TAG22(k-1, i, k));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG21(k, i), 1, tag_11_dep);
 	}
 
@@ -639,25 +659,27 @@ static void create_task_21(unsigned k, unsigned i)
 	if (get_block_rank(i, k) == rank)
 	{
 #ifdef VERBOSE_INIT
-		fprintf(stderr, "CREATE real task 21(k = %d, i = %d) on node %d\n", k, i, rank);
+		fprintf(stderr, "CREATE real task 21(k = %u, i = %u) on node %d\n", k, i, rank);
 #endif
 		create_task_21_real(k, i);
 	}
-	else {
+	else
+	{
 		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
 		int rank_mask[world_size];
 		find_nodes_using_21(k, i, rank_mask);
-		
+
 		if (rank_mask[rank])
 		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "create RECV task 21(k = %d, i = %d) on node %d\n", k, i, rank);
+			fprintf(stderr, "create RECV task 21(k = %u, i = %u) on node %d\n", k, i, rank);
 #endif
 			create_task_21_recv(k, i);
 		}
-		else {
+		else
+		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "Node %d needs not 21(k=%d, i=%d)\n", rank, k,i);
+			fprintf(stderr, "Node %d needs not 21(k=%u, i=%u)\n", rank, k,i);
 #endif
 		}
 	}
@@ -679,7 +701,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
 	/* which sub-data is manipulated ? */
 
-	/* produced by TAG21_SAVE(k, i) */ 
+	/* produced by TAG21_SAVE(k, i) */
 	unsigned block21_is_local = (get_block_rank(i, k) == rank);
 	starpu_tag_t tag_21_dep;
 
@@ -689,7 +711,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 		block21 = STARPU_PLU(get_block_handle)(i, k);
 		tag_21_dep = TAG21(k, i);
 	}
-	else 
+	else
 	{
 #ifdef SINGLE_TMP1221
 		block21 = STARPU_PLU(get_tmp_21_block_handle)(i);
@@ -710,7 +732,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 		block12 = STARPU_PLU(get_block_handle)(k, j);
 		tag_12_dep = TAG12(k, j);
 	}
-	else 
+	else
 	{
 #ifdef SINGLE_TMP1221
 		block12 = STARPU_PLU(get_tmp_12_block_handle)(j);
@@ -722,7 +744,9 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
 
 
+#ifdef STARPU_DEVEL
 #warning temporary fix :/
+#endif
 	//task->handles[0] = block21;
 	task->handles[0] = block12;
 
@@ -736,15 +760,18 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
 	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);
 
-	if (!no_prio && (i == k + 1) && (j == k +1) ) {
+	if (!no_prio && (i == k + 1) && (j == k +1) )
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), tag_12_dep, tag_21_dep);
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, tag_12_dep, tag_21_dep);
 	}
 
@@ -759,7 +786,8 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 	//	fprintf(stderr, "CREATE real task 22(k = %d, i = %d, j = %d) on node %d\n", k, i, j, rank);
 		create_task_22_real(k, i, j);
 	}
-//	else {
+//	else
+//	{
 //		fprintf(stderr, "Node %d needs not 22(k=%d, i=%d, j = %d)\n", rank, k,i,j);
 //	}
 }
@@ -787,7 +815,7 @@ static void wait_termination(void)
 			starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
 			wait_tag_and_fetch_handle(TAG11_SAVE(k), diag_block);
 		}
-		
+
 
 		for (i = k + 1; i < nblocks; i++)
 		{
@@ -812,11 +840,11 @@ static void wait_termination(void)
 				wait_tag_and_fetch_handle(TAG12_SAVE(k, j), block12);
 			}
 		}
-	}	
+	}
 }
 
 /*
- *	code to bootstrap the factorization 
+ *	code to bootstrap the factorization
  */
 
 double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
@@ -833,6 +861,8 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 	for (k = 0; k < nblocks; k++)
 	{
+		starpu_iteration_push(k);
+
 		create_task_11(k);
 
 		for (i = k+1; i<nblocks; i++)
@@ -848,6 +878,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 				create_task_22(k, i, j);
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
@@ -859,12 +890,12 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 	starpu_tag_notify_from_apps(STARPU_TAG_INIT);
 
 	wait_termination();
-	
+
 	end = starpu_timing_now();
 
 	double timing = end - start;
-	
+
 //	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
-	
+
 	return timing;
 }

+ 11 - 3
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013-2015  Université de Bordeaux
- * Copyright (C) 2010, 2012, 2013  CNRS
+ * Copyright (C) 2010-2011, 2013-2015, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2013, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,7 +29,8 @@ static unsigned nblocks = 0;
 static int rank = -1;
 static int world_size = -1;
 
-struct callback_arg {
+struct callback_arg
+{
 	unsigned i, j, k;
 };
 
@@ -56,7 +57,9 @@ static void create_task_11(unsigned k)
 
 static void create_task_12(unsigned k, unsigned j)
 {
+#ifdef STARPU_DEVEL
 #warning temporary fix 
+#endif
 	starpu_mpi_task_insert(MPI_COMM_WORLD,
 			       //&STARPU_PLU(cl12),
 			       &STARPU_PLU(cl21),
@@ -76,7 +79,9 @@ static void create_task_12(unsigned k, unsigned j)
 
 static void create_task_21(unsigned k, unsigned i)
 {
+#ifdef STARPU_DEVEL
 #warning temporary fix 
+#endif
 	starpu_mpi_task_insert(MPI_COMM_WORLD,
 			       //&STARPU_PLU(cl21),
 			       &STARPU_PLU(cl12),
@@ -131,6 +136,8 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 	for (k = 0; k < nblocks; k++)
 	{
+		starpu_iteration_push(k);
+
 		create_task_11(k);
 
 		for (i = k+1; i<nblocks; i++)
@@ -160,6 +167,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 			if (get_block_rank(i, k) == _rank)
 				starpu_data_wont_use(STARPU_PLU(get_block_handle)(i,k));
 		}
+		starpu_iteration_pop();
 	}
 
 	starpu_task_wait_for_all();

+ 33 - 21
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux
- * Copyright (C) 2010, 2012  CNRS
+ * Copyright (C) 2010, 2012, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -45,7 +45,7 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 
 	int rank;
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
-	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d\n", rank, info->k, info->i, info->j);
+	fprintf(stderr, "KERNEL 22 %d - k = %u i = %u j = %u\n", rank, info->k, info->i, info->j);
 #endif
 
 #ifdef STARPU_USE_CUDA
@@ -53,7 +53,8 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 	cudaError_t cures;
 #endif
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			CPU_GEMM("N", "N", dy, dx, dz,
 				(TYPE)-1.0, right, ld21, left, ld12,
@@ -80,7 +81,7 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 			break;
 	}
 #ifdef VERBOSE_KERNELS
-	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d done\n", rank, info->k, info->i, info->j);
+	fprintf(stderr, "KERNEL 22 %d - k = %u i = %u j = %u done\n", rank, info->k, info->i, info->j);
 #endif
 }
 
@@ -96,7 +97,8 @@ static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
 }
 #endif// STARPU_USE_CUDA
 
-static struct starpu_perfmodel STARPU_PLU(model_22) = {
+static struct starpu_perfmodel STARPU_PLU(model_22) =
+{
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
@@ -107,7 +109,8 @@ static struct starpu_perfmodel STARPU_PLU(model_22) = {
 #endif
 };
 
-struct starpu_codelet STARPU_PLU(cl22) = {
+struct starpu_codelet STARPU_PLU(cl22) =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {STARPU_PLU(cpu_u22)},
 #ifdef STARPU_USE_CUDA
@@ -144,8 +147,8 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 	int rank;
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 #warning fixed debugging according to other tweak
-	//fprintf(stderr, "KERNEL 12 %d - k = %d i %d\n", rank, info->k, info->i);
-	fprintf(stderr, "KERNEL 21 %d - k = %d i %d\n", rank, info->k, info->j);
+	//fprintf(stderr, "KERNEL 12 %d - k = %u i %u\n", rank, info->k, info->i);
+	fprintf(stderr, "KERNEL 21 %d - k = %u i %u\n", rank, info->k, info->j);
 
 	//fprintf(stderr, "INPUT 12 U11\n");
 	fprintf(stderr, "INPUT 21 U11\n");
@@ -161,7 +164,8 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 #endif
 
 	/* solve L11 U12 = A12 (find U12) */
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
 					(TYPE)1.0, sub11, ld11, sub12, ld12);
@@ -204,7 +208,8 @@ static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
 }
 #endif // STARPU_USE_CUDA
 
-static struct starpu_perfmodel STARPU_PLU(model_12) = {
+static struct starpu_perfmodel STARPU_PLU(model_12) =
+{
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
@@ -215,7 +220,8 @@ static struct starpu_perfmodel STARPU_PLU(model_12) = {
 #endif
 };
 
-struct starpu_codelet STARPU_PLU(cl12) = {
+struct starpu_codelet STARPU_PLU(cl12) =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {STARPU_PLU(cpu_u12)},
 #ifdef STARPU_USE_CUDA
@@ -252,8 +258,8 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 	int rank;
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 #warning fixed debugging according to other tweak
-	//fprintf(stderr, "KERNEL 21 %d (k = %d, i = %d)\n", rank, info->k, info->i);
-	fprintf(stderr, "KERNEL 12 %d (k = %d, j = %d)\n", rank, info->k, info->j);
+	//fprintf(stderr, "KERNEL 21 %d (k = %u, i = %u)\n", rank, info->k, info->i);
+	fprintf(stderr, "KERNEL 12 %d (k = %u, j = %u)\n", rank, info->k, info->j);
 
 	//fprintf(stderr, "INPUT 21 U11\n");
 	fprintf(stderr, "INPUT 12 U11\n");
@@ -268,7 +274,8 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 #endif
 
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
 					(TYPE)1.0, sub11, ld11, sub21, ld21);
@@ -313,7 +320,8 @@ static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
 }
 #endif
 
-static struct starpu_perfmodel STARPU_PLU(model_21) = {
+static struct starpu_perfmodel STARPU_PLU(model_21) =
+{
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
@@ -324,7 +332,8 @@ static struct starpu_perfmodel STARPU_PLU(model_21) = {
 #endif
 };
 
-struct starpu_codelet STARPU_PLU(cl21) = {
+struct starpu_codelet STARPU_PLU(cl21) =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {STARPU_PLU(cpu_u21)},
 #ifdef STARPU_USE_CUDA
@@ -357,10 +366,11 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 
 	int rank;
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
-	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
+	fprintf(stderr, "KERNEL 11 %d - k = %u\n", rank, info->k);
 #endif
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			for (z = 0; z < nx; z++)
 			{
@@ -403,7 +413,7 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 			break;
 	}
 #ifdef VERBOSE_KERNELS
-	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
+	fprintf(stderr, "KERNEL 11 %d - k = %u\n", rank, info->k);
 #endif
 }
 
@@ -419,7 +429,8 @@ static void STARPU_PLU(cublas_u11)(void *descr[], void *_args)
 }
 #endif// STARPU_USE_CUDA
 
-static struct starpu_perfmodel STARPU_PLU(model_11) = {
+static struct starpu_perfmodel STARPU_PLU(model_11) =
+{
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
@@ -430,7 +441,8 @@ static struct starpu_perfmodel STARPU_PLU(model_11) = {
 #endif
 };
 
-struct starpu_codelet STARPU_PLU(cl11) = {
+struct starpu_codelet STARPU_PLU(cl11) =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {STARPU_PLU(cpu_u11)},
 #ifdef STARPU_USE_CUDA

+ 8 - 1
mpi/examples/stencil/stencil5.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2013, 2015-2016              Université Bordeaux
+ * Copyright (C) 2011, 2013, 2015-2017              Université Bordeaux
  * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -166,6 +166,7 @@ int main(int argc, char **argv)
 			}
 			if (data_handles[x][y])
 			{
+				starpu_data_set_coordinates(data_handles[x][y], 2, x, y);
 				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
 			}
 		}
@@ -174,6 +175,8 @@ int main(int argc, char **argv)
 	/* First computation with initial distribution */
 	for(loop=0 ; loop<niter; loop++)
 	{
+		starpu_iteration_push(loop);
+
 		for (x = 1; x < X-1; x++)
 		{
 			for (y = 1; y < Y-1; y++)
@@ -184,6 +187,7 @@ int main(int argc, char **argv)
 						       0);
 			}
 		}
+		starpu_iteration_pop();
 	}
 	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();
@@ -213,6 +217,8 @@ int main(int argc, char **argv)
 	/* Second computation with new distribution */
 	for(loop=0 ; loop<niter; loop++)
 	{
+		starpu_iteration_push(niter + loop);
+
 		for (x = 1; x < X-1; x++)
 		{
 			for (y = 1; y < Y-1; y++)
@@ -223,6 +229,7 @@ int main(int argc, char **argv)
 						       0);
 			}
 		}
+		starpu_iteration_pop();
 	}
 	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();

+ 5 - 1
mpi/examples/stencil/stencil5_lb.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2013, 2015-2016              Université Bordeaux
+ * Copyright (C) 2011, 2013, 2015-2017              Université Bordeaux
  * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -227,6 +227,7 @@ int main(int argc, char **argv)
 			}
 			if (data_nodes[x][y].data_handle)
 			{
+				starpu_data_set_coordinates(data_nodes[x][y].data_handle, 2, x, y);
 				starpu_mpi_data_register(data_nodes[x][y].data_handle, (y*X)+x, data_nodes[x][y].node);
 			}
 		}
@@ -235,6 +236,8 @@ int main(int argc, char **argv)
 	/* First computation with initial distribution */
 	for(loop=0 ; loop<niter; loop++)
 	{
+		starpu_iteration_push(loop);
+
 		for (x = 1; x < X-1; x++)
 		{
 			for (y = 1; y < Y-1; y++)
@@ -246,6 +249,7 @@ int main(int argc, char **argv)
 						       0);
 			}
 		}
+		starpu_iteration_pop();
 	}
 	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();

+ 2 - 1
mpi/include/starpu_mpi.h

@@ -26,7 +26,8 @@
 #include <mpi.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 typedef void *starpu_mpi_req;

+ 2 - 1
mpi/include/starpu_mpi_lb.h

@@ -19,7 +19,8 @@
 #define __STARPU_MPI_LOAD_BALANCER_H__
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 struct starpu_mpi_lb_conf

+ 3 - 2
mpi/src/load_balancer/policy/data_movements_interface.c

@@ -17,6 +17,7 @@
 
 #include <starpu.h>
 #include <stdlib.h>
+#include <starpu_mpi_private.h>
 
 #include "data_movements_interface.h"
 
@@ -91,8 +92,8 @@ int data_movements_reallocate_tables(starpu_data_handle_t handle, int size)
 
 	if (dm_interface->size)
 	{
-		dm_interface->tags = malloc(size*sizeof(int));
-		dm_interface->ranks = malloc(size*sizeof(int));
+		_STARPU_MPI_MALLOC(dm_interface->tags, size*sizeof(int));
+		_STARPU_MPI_MALLOC(dm_interface->ranks, size*sizeof(int));
 	}
 
 	return 0 ;

+ 2 - 1
mpi/src/load_balancer/policy/load_balancer_policy.h

@@ -21,7 +21,8 @@
 #include <starpu_mpi_lb.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 /* A load balancer consists in a collection of operations on a data

+ 8 - 10
mpi/src/load_balancer/policy/load_heat_propagation.c

@@ -20,7 +20,7 @@
 #include <common/uthash.h>
 #include <common/utils.h>
 #include <math.h>
-
+#include <starpu_mpi_private.h>
 #include "load_balancer_policy.h"
 #include "data_movements_interface.h"
 #include "load_data_interface.h"
@@ -259,7 +259,8 @@ static void update_data_ranks()
 				/* Save the fact that the data has been moved out of this node */
 				if (i == my_rank)
 				{
-					struct moved_data_entry *md = (struct moved_data_entry *)malloc(sizeof(struct moved_data_entry));
+					struct moved_data_entry *md;
+					_STARPU_MPI_MALLOC(md, sizeof(struct moved_data_entry));
 					md->handle = handle;
 					HASH_ADD_PTR(mdh, handle, md);
 				}
@@ -416,7 +417,7 @@ static int init_heat(struct starpu_mpi_lb_conf *itf)
 		return 1;
 	}
 
-	user_itf = malloc(sizeof(struct starpu_mpi_lb_conf));
+	_STARPU_MPI_MALLOC(user_itf, sizeof(struct starpu_mpi_lb_conf));
 	memcpy(user_itf, itf, sizeof(struct starpu_mpi_lb_conf));;
 
 	/* Get the neighbors of the local MPI node */
@@ -462,22 +463,19 @@ static int init_heat(struct starpu_mpi_lb_conf *itf)
 	 * step. */
 
 	/* Local load data */
-	load_data_handle = malloc(sizeof(starpu_data_handle_t));
-	memset(load_data_handle, 0, sizeof(starpu_data_handle_t));
+	_STARPU_MPI_CALLOC(load_data_handle, 1, sizeof(starpu_data_handle_t));
 	load_data_data_register(load_data_handle, STARPU_MAIN_RAM, sleep_task_threshold, wakeup_ratio);
 
 	/* Copy of the local load data to enable parallel update of the load data
 	 * with communications to neighbor nodes */
-	load_data_handle_cpy = malloc(sizeof(starpu_data_handle_t));
-	memset(load_data_handle_cpy, 0, sizeof(starpu_data_handle_t));
+	_STARPU_MPI_CALLOC(load_data_handle_cpy, 1, sizeof(starpu_data_handle_t));
 	void *local_interface = starpu_data_get_interface_on_node(*load_data_handle, STARPU_MAIN_RAM);
 	struct starpu_data_interface_ops *itf_load_data = starpu_data_get_interface_ops(*load_data_handle);
 	starpu_data_register(load_data_handle_cpy, STARPU_MAIN_RAM, local_interface, itf_load_data);
 	starpu_mpi_data_register(*load_data_handle_cpy, TAG_LOAD(my_rank), my_rank);
 
 	/* Remote load data */
-	neighbor_load_data_handles = malloc(nneighbors*sizeof(starpu_data_handle_t));
-	memset(neighbor_load_data_handles, 0, nneighbors*sizeof(starpu_data_handle_t));
+	_STARPU_MPI_CALLOC(neighbor_load_data_handles, nneighbors, sizeof(starpu_data_handle_t));
 	for (i = 0; i < nneighbors; i++)
 	{
 		load_data_data_register(&neighbor_load_data_handles[i], STARPU_MAIN_RAM, sleep_task_threshold, wakeup_ratio);
@@ -485,7 +483,7 @@ static int init_heat(struct starpu_mpi_lb_conf *itf)
 	}
 
 	/* Data movements handles */
-	data_movements_handles = malloc(world_size*sizeof(starpu_data_handle_t));
+	_STARPU_MPI_MALLOC(data_movements_handles, world_size*sizeof(starpu_data_handle_t));
 	for (i = 0; i < world_size; i++)
 	{
 		data_movements_data_register(&data_movements_handles[i], STARPU_MAIN_RAM, NULL, NULL, 0);

+ 53 - 26
mpi/src/starpu_mpi.c

@@ -38,6 +38,9 @@
 #include <core/simgrid.h>
 #include <core/task.h>
 
+/* Number of ready requests to process before polling for completed requests */
+#define NREADY_PROCESS 10
+
 static void _starpu_mpi_add_sync_point_in_fxt(void);
 static void _starpu_mpi_submit_ready_request(void *arg);
 static void _starpu_mpi_handle_ready_request(struct _starpu_mpi_req *req);
@@ -145,6 +148,8 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	(*req)->early_data_handle = NULL;
 	(*req)->envelope = NULL;
 	(*req)->sequential_consistency = 1;
+	(*req)->pre_sync_jobid = -1;
+	(*req)->post_sync_jobid = -1;
 
 #ifdef STARPU_SIMGRID
 	starpu_pthread_queue_init(&((*req)->queue));
@@ -166,7 +171,6 @@ static void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req)
 	starpu_pthread_queue_destroy(&req->queue);
 #endif
 	free(req);
-	req = NULL;
 }
 
  /********************************************************/
@@ -331,7 +335,8 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 
 	if (_starpu_mpi_fake_world_size != -1)
 	{
-		starpu_data_acquire_cb_sequential_consistency(data_handle, mode, nop_acquire_cb, data_handle, sequential_consistency);
+		/* Don't actually do the communication */
+		starpu_data_acquire_on_node_cb_sequential_consistency(data_handle, STARPU_MAIN_RAM, mode, nop_acquire_cb, data_handle, sequential_consistency);
 		return NULL;
 	}
 
@@ -359,7 +364,7 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 	/* Asynchronously request StarPU to fetch the data in main memory: when
 	 * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
 	 * the request is actually submitted */
-	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_ready_request, (void *)req, sequential_consistency);
+	starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_submit_ready_request, (void *)req, sequential_consistency, &req->pre_sync_jobid, &req->post_sync_jobid);
 
 	_STARPU_MPI_LOG_OUT();
 	return req;
@@ -432,13 +437,13 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 
 	if (req->sync == 0)
 	{
-		_STARPU_MPI_COMM_TO_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
+		_STARPU_MPI_COMM_TO_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.comm, &req->data_request);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
 	else
 	{
-		_STARPU_MPI_COMM_TO_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
+		_STARPU_MPI_COMM_TO_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Issend(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.comm, &req->data_request);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Issend returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
@@ -447,7 +452,7 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 	_starpu_mpi_simgrid_wait_req(&req->data_request, &req->status_store, &req->queue, &req->done);
 #endif
 
-	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle));
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid);
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -478,7 +483,7 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 		MPI_Type_size(req->datatype, &size);
 		req->envelope->size = (starpu_ssize_t)req->count * size;
 		_STARPU_MPI_DEBUG(20, "Post MPI isend count (%ld) datatype_size %ld request to %d\n",req->count,starpu_data_get_size(req->data_handle), req->node_tag.rank);
-		_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
+		_STARPU_MPI_COMM_TO_DEBUG(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->envelope->data_tag, req->node_tag.comm);
 		MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
 	}
 	else
@@ -493,7 +498,7 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
  			// We already know the size of the data, let's send it to overlap with the packing of the data
 			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (first call to pack)\n", req->envelope->size, sizeof(req->count), "MPI_BYTE", req->node_tag.rank);
 			req->count = req->envelope->size;
-			_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
+			_STARPU_MPI_COMM_TO_DEBUG(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->envelope->data_tag, req->node_tag.comm);
 			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
 			STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(ret));
  		}
@@ -504,7 +509,7 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
  		{
  			// We know the size now, let's send it
 			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (second call to pack)\n", req->envelope->size, sizeof(req->count), "MPI_BYTE", req->node_tag.rank);
-			_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
+			_STARPU_MPI_COMM_TO_DEBUG(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->envelope->data_tag, req->node_tag.comm);
 			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
 			STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(ret));
  		}
@@ -629,7 +634,7 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 		_envelope->mode = _STARPU_MPI_ENVELOPE_SYNC_READY;
 		_envelope->data_tag = req->node_tag.data_tag;
 		_STARPU_MPI_DEBUG(20, "Telling node %d it can send the data and waiting for the data back ...\n", req->node_tag.rank);
-		_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
+		_STARPU_MPI_COMM_TO_DEBUG(_envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _envelope->data_tag, req->node_tag.comm);
 		req->ret = MPI_Send(_envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Send returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 		free(_envelope);
@@ -638,12 +643,12 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
 	if (req->sync)
 	{
-		_STARPU_MPI_COMM_FROM_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
+		_STARPU_MPI_COMM_FROM_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.comm, &req->data_request);
 	}
 	else
 	{
-		_STARPU_MPI_COMM_FROM_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
+		_STARPU_MPI_COMM_FROM_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.comm, &req->data_request);
 #ifdef STARPU_SIMGRID
 		_starpu_mpi_simgrid_wait_req(&req->data_request, &req->status_store, &req->queue, &req->done);
@@ -1063,6 +1068,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 				_starpu_mpi_datatype_free(req->data_handle, &req->datatype);
 			}
 		}
+		_STARPU_MPI_TRACE_TERMINATED(req, req->node_tag.rank, req->node_tag.data_tag);
 	}
 
 	if (req->data_handle)
@@ -1166,6 +1172,8 @@ static void _starpu_mpi_test_detached_requests(void)
 	{
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
+		STARPU_MPI_ASSERT_MSG(req->data_request != MPI_REQUEST_NULL, "Cannot test completion of the request MPI_REQUEST_NULL");
+
 		//_STARPU_MPI_DEBUG(3, "Test detached request %p - mpitag %d - TYPE %s %d\n", &req->data_request, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->node_tag.rank);
 #ifdef STARPU_SIMGRID
 		req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, &flag);
@@ -1281,21 +1289,28 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 
 	// We wait until the request is pushed in the
-	// ready_request list, that ensures that the next loop
-	// will call _starpu_mpi_handle_ready_request
-	// on the request and post the corresponding mpi_irecv,
-	// otherwise, it may lead to read data as envelop
+	// ready_request list
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->posted_mutex));
 	while (!(early_data_handle->req->posted))
 		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->posted_cond), &(early_data_handle->req->posted_mutex));
 	STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->posted_mutex));
 
+#ifdef STARPU_DEVEL
+#warning check if req_ready is still necessary
+#endif
 	STARPU_PTHREAD_MUTEX_LOCK(&early_data_handle->req_mutex);
 	early_data_handle->req_ready = 1;
 	STARPU_PTHREAD_COND_BROADCAST(&early_data_handle->req_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+
+	// Handle the request immediatly to make sure the mpi_irecv is
+	// posted before receiving an other envelope
+	_starpu_mpi_req_list_erase(ready_requests, early_data_handle->req);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+	_starpu_mpi_handle_ready_request(early_data_handle->req);
+	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 }
 
 static void *_starpu_mpi_progress_thread_func(void *arg)
@@ -1321,7 +1336,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
 	/* And set TSD for us */
 #ifdef HAVE_SMPI_PROCESS_SET_USER_DATA
-	smpi_process_set_user_data(calloc(MAX_TSD + 1, sizeof(void*)));
+	void **tsd;
+	_STARPU_CALLOC(tsd, MAX_TSD + 1, sizeof(void*));
+	smpi_process_set_user_data(tsd);
 #endif
 #endif
 
@@ -1386,9 +1403,15 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		}
 
 		/* get one request */
+		int n = 0;
 		while (!_starpu_mpi_req_list_empty(ready_requests))
 		{
 			struct _starpu_mpi_req *req;
+
+			if (n++ == NREADY_PROCESS)
+				/* Already spent some time on submitting ready requests, poll before processing more ready requests */
+				break;
+
 			req = _starpu_mpi_req_list_pop_back(ready_requests);
 
 			/* handling a request is likely to block for a while
@@ -1426,6 +1449,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 			if (flag)
 			{
+				_STARPU_MPI_COMM_FROM_DEBUG(envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, envelope_status.MPI_SOURCE, _STARPU_MPI_TAG_ENVELOPE, envelope->data_tag, envelope_comm);
 				_STARPU_MPI_DEBUG(4, "Envelope received with mode %d\n", envelope->mode);
 				if (envelope->mode == _STARPU_MPI_ENVELOPE_SYNC_READY)
 				{
@@ -1618,7 +1642,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
         detached_requests = _starpu_mpi_req_list_new();
 
         STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
-        _starpu_mpi_comm = starpu_getenv("STARPU_MPI_COMM") != NULL;
+        _starpu_mpi_comm_debug = starpu_getenv("STARPU_MPI_COMM") != NULL;
 
 #ifdef STARPU_SIMGRID
 	STARPU_PTHREAD_MUTEX_INIT(&wait_counter_mutex, NULL);
@@ -1712,6 +1736,7 @@ void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, int tag, in
 	}
 	if (rank != -1)
 	{
+		_STARPU_MPI_TRACE_DATA_SET_RANK(data_handle, rank);
 		mpi_data->node_tag.rank = rank;
 		mpi_data->node_tag.comm = comm;
 		_starpu_mpi_comm_register(comm);
@@ -1745,18 +1770,19 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	int me, rank, tag;
 
 	rank = starpu_mpi_data_get_rank(data_handle);
-	tag = starpu_mpi_data_get_tag(data_handle);
 	if (rank == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register() or starpu_mpi_data_register()\n");
 	}
+
+	starpu_mpi_comm_rank(comm, &me);
+	if (node == rank) return;
+
+	tag = starpu_mpi_data_get_tag(data_handle);
 	if (tag == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register() or starpu_mpi_data_register()\n");
 	}
-	starpu_mpi_comm_rank(comm, &me);
-
-	if (node == rank) return;
 
 	if (me == node)
 	{
@@ -1785,18 +1811,19 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	int me, rank, tag;
 
 	rank = starpu_mpi_data_get_rank(data_handle);
-	tag = starpu_mpi_data_get_tag(data_handle);
 	if (rank == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 	}
+
+	starpu_mpi_comm_rank(comm, &me);
+	if (node == rank) return;
+
+	tag = starpu_mpi_data_get_tag(data_handle);
 	if (tag == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 	}
-	starpu_mpi_comm_rank(comm, &me);
-
-	if (node == rank) return;
 
 	if (me == node)
 	{

+ 42 - 44
mpi/src/starpu_mpi_cache.c

@@ -37,6 +37,8 @@ int _starpu_cache_enabled=1;
 static MPI_Comm _starpu_cache_comm;
 static int _starpu_cache_comm_size;
 
+static void _starpu_mpi_cache_flush_nolock(starpu_data_handle_t data_handle);
+
 int starpu_mpi_cache_is_enabled()
 {
 	return _starpu_cache_enabled==1;
@@ -101,18 +103,14 @@ void _starpu_mpi_cache_shutdown()
 
 void _starpu_mpi_cache_data_clear(starpu_data_handle_t data_handle)
 {
-	int i;
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
 	if (_starpu_cache_enabled == 0) return;
 
-	_starpu_mpi_cache_flush(data_handle);
-	for(i=0 ; i<_starpu_cache_comm_size ; i++)
-	{
-		STARPU_PTHREAD_MUTEX_DESTROY(&mpi_data->cache_sent_mutex[i]);
-	}
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	_starpu_mpi_cache_flush_nolock(data_handle);
 	free(mpi_data->cache_sent);
-	free(mpi_data->cache_sent_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
 void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle)
@@ -122,24 +120,22 @@ void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle)
 
 	if (_starpu_cache_enabled == 0) return;
 
-	STARPU_PTHREAD_MUTEX_INIT(&mpi_data->cache_received_mutex, NULL);
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	mpi_data->cache_received = 0;
-	_STARPU_MALLOC(mpi_data->cache_sent_mutex, _starpu_cache_comm_size*sizeof(mpi_data->cache_sent_mutex[0]));
 	_STARPU_MALLOC(mpi_data->cache_sent, _starpu_cache_comm_size*sizeof(mpi_data->cache_sent[0]));
 	for(i=0 ; i<_starpu_cache_comm_size ; i++)
 	{
-		STARPU_PTHREAD_MUTEX_INIT(&mpi_data->cache_sent_mutex[i], NULL);
 		mpi_data->cache_sent[i] = 0;
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
-static void _starpu_mpi_cache_data_add(starpu_data_handle_t data_handle)
+static void _starpu_mpi_cache_data_add_nolock(starpu_data_handle_t data_handle)
 {
 	struct _starpu_data_entry *entry;
 
 	if (_starpu_cache_enabled == 0) return;
 
-	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	HASH_FIND_PTR(_cache_data, &data_handle, entry);
 	if (entry == NULL)
 	{
@@ -147,23 +143,20 @@ static void _starpu_mpi_cache_data_add(starpu_data_handle_t data_handle)
 		entry->data_handle = data_handle;
 		HASH_ADD_PTR(_cache_data, data_handle, entry);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
-static void _starpu_mpi_cache_data_remove(starpu_data_handle_t data_handle)
+static void _starpu_mpi_cache_data_remove_nolock(starpu_data_handle_t data_handle)
 {
 	struct _starpu_data_entry *entry;
 
 	if (_starpu_cache_enabled == 0) return;
 
-	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	HASH_FIND_PTR(_cache_data, &data_handle, entry);
 	if (entry)
 	{
 		HASH_DEL(_cache_data, entry);
 		free(entry);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
 /**************************************
@@ -176,10 +169,10 @@ void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data_handle)
 
 	if (_starpu_cache_enabled == 0) return;
 
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	STARPU_ASSERT(mpi_data->magic == 42);
 	STARPU_MPI_ASSERT_MSG(mpi_rank < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", mpi_rank, _starpu_cache_comm_size);
 
-	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_received_mutex);
 	if (mpi_data->cache_received == 1)
 	{
 #ifdef STARPU_DEVEL
@@ -188,10 +181,10 @@ void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data_handle)
 		_STARPU_MPI_DEBUG(2, "Clearing receive cache for data %p\n", data_handle);
 		mpi_data->cache_received = 0;
 		starpu_data_invalidate_submit(data_handle);
-		_starpu_mpi_cache_data_remove(data_handle);
+		_starpu_mpi_cache_data_remove_nolock(data_handle);
 		_starpu_mpi_cache_stats_dec(mpi_rank, data_handle);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_received_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
 int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data_handle)
@@ -201,23 +194,23 @@ int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data_handle)
 
 	if (_starpu_cache_enabled == 0) return 0;
 
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	STARPU_ASSERT(mpi_data->magic == 42);
 	STARPU_MPI_ASSERT_MSG(mpi_rank < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", mpi_rank, _starpu_cache_comm_size);
 
-	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_received_mutex);
 	int already_received = mpi_data->cache_received;
 	if (already_received == 0)
 	{
 		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been received by %d\n", data_handle, mpi_rank);
 		mpi_data->cache_received = 1;
-		_starpu_mpi_cache_data_add(data_handle);
+		_starpu_mpi_cache_data_add_nolock(data_handle);
 		_starpu_mpi_cache_stats_inc(mpi_rank, data_handle);
 	}
 	else
 	{
 		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data_handle, mpi_rank);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_received_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 	return already_received;
 }
 
@@ -228,11 +221,10 @@ int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data_handle)
 
 	if (_starpu_cache_enabled == 0) return 0;
 
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	STARPU_ASSERT(mpi_data->magic == 42);
-
-	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_received_mutex);
 	already_received = mpi_data->cache_received;
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_received_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 	return already_received;
 }
 
@@ -251,18 +243,18 @@ void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data_handle)
 
 	if (_starpu_cache_enabled == 0) return;
 
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	starpu_mpi_comm_size(mpi_data->node_tag.comm, &size);
 	for(n=0 ; n<size ; n++)
 	{
-		STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_sent_mutex[n]);
 		if (mpi_data->cache_sent[n] == 1)
 		{
 			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
 			mpi_data->cache_sent[n] = 0;
-			_starpu_mpi_cache_data_remove(data_handle);
+			_starpu_mpi_cache_data_remove_nolock(data_handle);
 		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_sent_mutex[n]);
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
 int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data_handle, int dest)
@@ -273,19 +265,19 @@ int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data_handle, int dest)
 
 	STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
 
-	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_sent_mutex[dest]);
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	int already_sent = mpi_data->cache_sent[dest];
 	if (mpi_data->cache_sent[dest] == 0)
 	{
 		mpi_data->cache_sent[dest] = 1;
-		_starpu_mpi_cache_data_add(data_handle);
+		_starpu_mpi_cache_data_add_nolock(data_handle);
 		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been sent to %d\n", data_handle, dest);
 	}
 	else
 	{
 		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data_handle, dest);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_sent_mutex[dest]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 	return already_sent;
 }
 
@@ -296,11 +288,10 @@ int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data_handle, int dest)
 
 	if (_starpu_cache_enabled == 0) return 0;
 
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
-
-	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_sent_mutex[dest]);
 	already_sent = mpi_data->cache_sent[dest];
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_sent_mutex[dest]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 	return already_sent;
 }
 
@@ -309,7 +300,7 @@ int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest)
 	return _starpu_mpi_cache_sent_data_get(data_handle, dest);
 }
 
-void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle)
+static void _starpu_mpi_cache_flush_nolock(starpu_data_handle_t data_handle)
 {
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 	int i, nb_nodes;
@@ -319,17 +310,14 @@ void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle)
 	starpu_mpi_comm_size(mpi_data->node_tag.comm, &nb_nodes);
 	for(i=0 ; i<nb_nodes ; i++)
 	{
-		STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_sent_mutex[i]);
 		if (mpi_data->cache_sent[i] == 1)
 		{
 			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
 			mpi_data->cache_sent[i] = 0;
 			_starpu_mpi_cache_stats_dec(i, data_handle);
 		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_sent_mutex[i]);
 	}
 
-	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_received_mutex);
 	if (mpi_data->cache_received == 1)
 	{
 		int mpi_rank = starpu_mpi_data_get_rank(data_handle);
@@ -337,14 +325,22 @@ void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle)
 		mpi_data->cache_received = 0;
 		_starpu_mpi_cache_stats_dec(mpi_rank, data_handle);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_received_mutex);
 }
 
-static void _starpu_mpi_cache_flush_and_invalidate(MPI_Comm comm, starpu_data_handle_t data_handle)
+void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle)
+{
+	if (_starpu_cache_enabled == 0) return;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	_starpu_mpi_cache_flush_nolock(data_handle);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+}
+
+static void _starpu_mpi_cache_flush_and_invalidate_nolock(MPI_Comm comm, starpu_data_handle_t data_handle)
 {
 	int my_rank, mpi_rank;
 
-	_starpu_mpi_cache_flush(data_handle);
+	_starpu_mpi_cache_flush_nolock(data_handle);
 
 	starpu_mpi_comm_rank(comm, &my_rank);
 	mpi_rank = starpu_mpi_data_get_rank(data_handle);
@@ -356,8 +352,10 @@ void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 {
 	if (_starpu_cache_enabled == 0) return;
 
-	_starpu_mpi_cache_flush_and_invalidate(comm, data_handle);
-	_starpu_mpi_cache_data_remove(data_handle);
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	_starpu_mpi_cache_flush_and_invalidate_nolock(comm, data_handle);
+	_starpu_mpi_cache_data_remove_nolock(data_handle);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
@@ -369,7 +367,7 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	HASH_ITER(hh, _cache_data, entry, tmp)
 	{
-		_starpu_mpi_cache_flush_and_invalidate(comm, entry->data_handle);
+		_starpu_mpi_cache_flush_and_invalidate_nolock(comm, entry->data_handle);
 		HASH_DEL(_cache_data, entry);
 		free(entry);
 	}

+ 2 - 1
mpi/src/starpu_mpi_cache.h

@@ -24,7 +24,8 @@
 #include <mpi.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 extern int _starpu_cache_enabled;

+ 2 - 1
mpi/src/starpu_mpi_cache_stats.h

@@ -22,7 +22,8 @@
 #include <mpi.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 void _starpu_mpi_cache_stats_init();

+ 1 - 1
mpi/src/starpu_mpi_comm.c

@@ -137,7 +137,7 @@ void _starpu_mpi_comm_post_recv()
 		if (_comm->posted == 0)
 		{
 			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop on comm %d %ld\n", i, (long int)_comm->comm);
-			_STARPU_MPI_COMM_FROM_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, _comm->comm);
+			_STARPU_MPI_COMM_FROM_DEBUG(_comm->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, _comm->comm);
 			MPI_Irecv(_comm->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _STARPU_MPI_TAG_ENVELOPE, _comm->comm, &_comm->request);
 #ifdef STARPU_SIMGRID
 			_starpu_mpi_simgrid_wait_req(&_comm->request, &_comm->status, &_comm->queue, &_comm->done);

+ 3 - 2
mpi/src/starpu_mpi_comm.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015, 2016  CNRS
+ * Copyright (C) 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,8 @@
 #include <mpi.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 void _starpu_mpi_comm_init(MPI_Comm comm);

+ 3 - 2
mpi/src/starpu_mpi_datatype.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011  Université de Bordeaux
- * Copyright (C) 2010, 2012, 2013, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2012, 2013, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,8 @@
 #include <starpu_mpi_private.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 void _starpu_mpi_datatype_init(void);

+ 3 - 2
mpi/src/starpu_mpi_early_data.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2014, 2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,7 +26,8 @@
 #include <starpu_mpi_private.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 LIST_TYPE(_starpu_mpi_early_data_handle,

+ 3 - 2
mpi/src/starpu_mpi_early_request.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,7 +25,8 @@
 #include <common/list.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 void _starpu_mpi_early_request_init(void);

+ 6 - 3
mpi/src/starpu_mpi_fortran.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  * Copyright (C) 2016  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -27,9 +27,12 @@ struct _starpu_mpi_argc_argv *fstarpu_mpi_argcv_alloc(int argc, int initialize_m
 	struct _starpu_mpi_argc_argv *argcv;
 	_STARPU_MPI_CALLOC(argcv, 1,sizeof(*argcv));
 	argcv->initialize_mpi = initialize_mpi;
-	if (comm_present) {
+	if (comm_present)
+	{
 		argcv->comm = MPI_Comm_f2c(comm);
-	} else {
+	}
+	else
+	{
 		argcv->comm = MPI_COMM_WORLD;
 	}
 	argcv->fargc = argc;

+ 17 - 6
mpi/src/starpu_mpi_fxt.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010, 2012, 2016  CNRS
+ * Copyright (C) 2010, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,7 +23,8 @@
 #include <common/fxt.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 #define _STARPU_MPI_FUT_START				0x5201
@@ -45,6 +46,9 @@ extern "C" {
 #define _STARPU_MPI_FUT_UTESTING_END			0x5217
 #define _STARPU_MPI_FUT_UWAIT_BEGIN			0x5218
 #define _STARPU_MPI_FUT_UWAIT_END			0x5219
+#define _STARPU_MPI_FUT_DATA_SET_RANK			0x521a
+#define _STARPU_MPI_FUT_IRECV_TERMINATED		0x521b
+#define _STARPU_MPI_FUT_ISEND_TERMINATED		0x521c
 
 #ifdef STARPU_USE_FXT
 #define _STARPU_MPI_TRACE_START(rank, worldsize)	\
@@ -55,8 +59,8 @@ extern "C" {
 	FUT_DO_PROBE4(_STARPU_MPI_FUT_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
 	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, mpi_tag, size, jobid)	\
+	FUT_DO_PROBE5(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), (jobid), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, mpi_tag)	\
@@ -73,6 +77,9 @@ extern "C" {
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_COMPLETE_END(type, rank, mpi_tag)		\
 	if (type == RECV_REQ) { _STARPU_MPI_TRACE_IRECV_COMPLETE_END((rank), (mpi_tag)); } else if (type == SEND_REQ) { _STARPU_MPI_TRACE_ISEND_COMPLETE_END((rank), (mpi_tag), 0); }
+#define _STARPU_MPI_TRACE_TERMINATED(req, rank, mpi_tag)		\
+	if ((req)->request_type == RECV_REQ) FUT_DO_PROBE4(_STARPU_MPI_FUT_IRECV_TERMINATED, (rank), (mpi_tag), (req)->post_sync_jobid, _starpu_gettid()); else \
+	if ((req)->request_type == SEND_REQ) FUT_DO_PROBE3(_STARPU_MPI_FUT_ISEND_TERMINATED, (rank), (mpi_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_SLEEP_BEGIN()	\
 	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_BEGIN, _starpu_gettid());
 #define _STARPU_MPI_TRACE_SLEEP_END()	\
@@ -89,18 +96,21 @@ extern "C" {
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
 #define _STARPU_MPI_TRACE_UWAIT_END(src, mpi_tag)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_DATA_SET_RANK(handle, rank)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_DATA_SET_RANK, (handle), (rank), _starpu_gettid());
 #define TRACE
 #else
 #define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
 #define _STARPU_MPI_TRACE_STOP(a, b)				do {} while(0);
 #define _STARPU_MPI_TRACE_BARRIER(a, b, c)			do {} while(0);
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
-#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c, d)		do {} while(0);
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(a, b)		do {} while(0);
 #define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
 #define _STARPU_MPI_TRACE_COMPLETE_BEGIN(a, b, c)		do {} while(0);
 #define _STARPU_MPI_TRACE_COMPLETE_END(a, b, c)			do {} while(0);
+#define _STARPU_MPI_TRACE_TERMINATED(a, b, c)			do {} while(0);
 #define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
 #define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
 #define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(a, b)		do {} while(0);
@@ -112,6 +122,7 @@ extern "C" {
 #define _STARPU_MPI_TRACE_UTESTING_END(a, b)			do {} while(0);
 #define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
 #define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_DATA_SET_RANK(a, b)			do {} while(0);
 #endif
 
 #ifdef __cplusplus

+ 3 - 2
mpi/src/starpu_mpi_init.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,8 @@
 #include <starpu_mpi.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 void _starpu_mpi_do_initialize(struct _starpu_mpi_argc_argv *argc_argv);

+ 2 - 2
mpi/src/starpu_mpi_private.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012, 2014-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,7 @@ int _starpu_debug_rank=-1;
 int _starpu_debug_level_min=0;
 int _starpu_debug_level_max=0;
 int _starpu_mpi_tag = 42;
-int _starpu_mpi_comm;
+int _starpu_mpi_comm_debug;
 
 void _starpu_mpi_set_debug_level_min(int level)
 {

+ 24 - 22
mpi/src/starpu_mpi_private.h

@@ -27,7 +27,8 @@
 #include <core/simgrid.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 #ifdef STARPU_SIMGRID
@@ -48,7 +49,7 @@ void _starpu_mpi_simgrid_wait_req(MPI_Request *request, 	MPI_Status *status, sta
 
 extern int _starpu_debug_rank;
 char *_starpu_mpi_get_mpi_error_code(int code);
-extern int _starpu_mpi_comm;
+extern int _starpu_mpi_comm_debug;
 
 #ifdef STARPU_VERBOSE
 extern int _starpu_debug_level_min;
@@ -93,24 +94,24 @@ int _starpu_debug_rank;
 #define _STARPU_MPI_REALLOC(ptr, size) do { ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); } while (0)
 
 #ifdef STARPU_VERBOSE
-#  define _STARPU_MPI_COMM_DEBUG(count, datatype, node, tag, utag, comm, way) \
-	do \
-	{ \
-	     	if (_starpu_mpi_comm)	\
-	     	{ \
-     			int __size; \
-			char _comm_name[128]; \
-			int _comm_name_len; \
-			int _rank; \
+#  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way) \
+	do								\
+	{							\
+	     	if (_starpu_mpi_comm_debug)			\
+		{					\
+     			int __size;			\
+			char _comm_name[128];		\
+			int _comm_name_len;		\
+			int _rank;			    \
 			starpu_mpi_comm_rank(comm, &_rank); \
-			MPI_Type_size(datatype, &__size); \
+			MPI_Type_size(datatype, &__size);		\
 			MPI_Comm_get_name(comm, _comm_name, &_comm_name_len); \
-			fprintf(stderr, "[%d][starpu_mpi] %s %d:%d(%d):%s %12s %ld     [%s:%d]\n", _rank, way, node, tag, utag, _comm_name, " ", count*__size, __starpu_func__ , __LINE__); \
-			fflush(stderr); \
-		} \
+			fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%d:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
+			fflush(stderr);					\
+		}							\
 	} while(0);
-#  define _STARPU_MPI_COMM_TO_DEBUG(count, datatype, dest, tag, utag, comm) 		_STARPU_MPI_COMM_DEBUG(count, datatype, dest, tag, utag, comm, "-->")
-#  define _STARPU_MPI_COMM_FROM_DEBUG(count, datatype, source, tag, utag, comm) 	_STARPU_MPI_COMM_DEBUG(count, datatype, source, tag, utag, comm, "<--")
+#  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) 	    _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
+#  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm)  _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
 #  define _STARPU_MPI_DEBUG(level, fmt, ...) \
 	do \
 	{								\
@@ -122,9 +123,9 @@ int _starpu_debug_rank;
 		}			\
 	} while(0);
 #else
-#  define _STARPU_MPI_COMM_DEBUG(count, datatype, node, tag, utag, comm, way)		do { } while(0)
-#  define _STARPU_MPI_COMM_TO_DEBUG(count, datatype, dest, tag, comm, utag)		do { } while(0)
-#  define _STARPU_MPI_COMM_FROM_DEBUG(count, datatype, source, tag, comm, utag)	do { } while(0)
+#  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way)  do { } while(0)
+#  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm)     do { } while(0)
+#  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm) do { } while(0)
 #  define _STARPU_MPI_DEBUG(level, fmt, ...)		do { } while(0)
 #endif
 
@@ -190,9 +191,7 @@ struct _starpu_mpi_data
 {
 	int magic;
 	struct _starpu_mpi_node_tag node_tag;
-	starpu_pthread_mutex_t *cache_sent_mutex;
 	int *cache_sent;
-	starpu_pthread_mutex_t cache_received_mutex;
 	int cache_received;
 };
 
@@ -250,6 +249,9 @@ LIST_TYPE(_starpu_mpi_req,
 
 	int sequential_consistency;
 
+	long pre_sync_jobid;
+	long post_sync_jobid;
+
      	UT_hash_handle hh;
 
 #ifdef STARPU_SIMGRID

+ 3 - 2
mpi/src/starpu_mpi_select_node.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2014, 2015  CNRS
+ * Copyright (C) 2014, 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,8 @@
 #include <mpi.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 #define _STARPU_MPI_NODE_SELECTION_MAX_POLICY 24

+ 2 - 1
mpi/src/starpu_mpi_stats.h

@@ -22,7 +22,8 @@
 #include <mpi.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 void _starpu_mpi_comm_amounts_init(MPI_Comm comm);

+ 3 - 2
mpi/src/starpu_mpi_sync_data.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015, 2016  CNRS
+ * Copyright (C) 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,8 @@
 #include <common/list.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 void _starpu_mpi_sync_data_init(void);

+ 3 - 2
mpi/src/starpu_mpi_tag.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015, 2016  CNRS
+ * Copyright (C) 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,8 @@
 #include <mpi.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 void _starpu_mpi_tag_init(void);

+ 9 - 9
mpi/src/starpu_mpi_task_insert.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
- * Copyright (C) 2011-2016  Université de Bordeaux
+ * Copyright (C) 2011-2017  Université de Bordeaux
  * Copyright (C) 2014, 2016 Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -102,10 +102,6 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 		{
 			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 		}
-		if (data_tag == -1)
-		{
-			_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
-		}
 
 		if (do_execute && mpi_rank != me)
 		{
@@ -113,6 +109,8 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 			int already_received = _starpu_mpi_cache_received_data_set(data);
 			if (already_received == 0)
 			{
+				if (data_tag == -1)
+					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 				_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data, mpi_rank);
 				starpu_mpi_irecv_detached(data, mpi_rank, data_tag, comm, NULL, NULL);
 			}
@@ -125,6 +123,8 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 			int already_sent = _starpu_mpi_cache_sent_data_set(data, xrank);
 			if (already_sent == 0)
 			{
+				if (data_tag == -1)
+					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 				_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data, xrank);
 				_SEND_DATA(data, mode, xrank, data_tag, comm, NULL, NULL);
 			}
@@ -144,20 +144,20 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 		{
 			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 		}
-		if(data_tag == -1)
-		{
-			_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
-		}
 		if (mpi_rank == me)
 		{
 			if (xrank != -1 && me != xrank)
 			{
 				_STARPU_MPI_DEBUG(1, "Receive data %p back from the task %d which executed the codelet ...\n", data, xrank);
+				if(data_tag == -1)
+					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 				starpu_mpi_irecv_detached(data, xrank, data_tag, comm, NULL, NULL);
 			}
 		}
 		else if (do_execute)
 		{
+			if(data_tag == -1)
+				_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 			_STARPU_MPI_DEBUG(1, "Send data %p back to its owner %d...\n", data, mpi_rank);
 			_SEND_DATA(data, mode, mpi_rank, data_tag, comm, NULL, NULL);
 		}

+ 3 - 2
mpi/src/starpu_mpi_task_insert.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,7 +18,8 @@
 #define __STARPU_MPI_TASK_INSERT_H__
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank);

+ 2 - 2
mpi/tests/block_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,7 +28,7 @@ int main(int argc, char **argv)
 {
 	int ret, rank, size;
 
-	MPI_Init(&argc, &argv);
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
 

+ 2 - 2
mpi/tests/block_interface_pinned.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,7 +28,7 @@ int main(int argc, char **argv)
 {
 	int ret, rank, size;
 
-	MPI_Init(&argc, &argv);
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
 

+ 2 - 2
mpi/tests/datatypes.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -333,7 +333,7 @@ int main(int argc, char **argv)
 	int ret, rank, size;
 	int error=0;
 
-	MPI_Init(&argc, &argv);
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
 

+ 3 - 2
mpi/tests/early_request.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015, 2016  CNRS
+ * Copyright (C) 2015, 2016, 2017  CNRS
  * Copyright (C) 2015  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -191,7 +191,8 @@ int main(int argc, char * argv[])
 	/* Init */
 	int ret;
 	int mpi_rank, mpi_size;
-	MPI_Init(&argc, &argv);
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &mpi_size);
 

+ 2 - 2
mpi/tests/gather.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013, 2015  CNRS
+ * Copyright (C) 2013, 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,7 +23,7 @@ int main(int argc, char **argv)
 	starpu_data_handle_t handle;
 	int var;
 
-	MPI_Init(&argc, &argv);
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
 

+ 2 - 2
mpi/tests/gather2.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013, 2015  CNRS
+ * Copyright (C) 2013, 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,7 @@ int main(int argc, char **argv)
 {
 	int ret, rank, size;
 
-	MPI_Init(&argc, &argv);
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
 

+ 17 - 4
mpi/tests/helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2015, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,19 @@
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 #define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
-    						int _disp_rank; starpu_mpi_comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
-                                                fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
-                                                fflush(ofile); }} while(0);
+			int _disp_rank; starpu_mpi_comm_rank(MPI_COMM_WORLD, &_disp_rank); \
+			fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
+			fflush(ofile); }} while(0);
+
+#define MPI_INIT_THREAD(argc, argv, required) do {	    \
+		int thread_support;					\
+		if (MPI_Init_thread(argc, argv, required, &thread_support) != MPI_SUCCESS) \
+		{						\
+			fprintf(stderr,"MPI_Init_thread failed\n");	\
+			exit(1);					\
+		}							\
+		if (thread_support == MPI_THREAD_FUNNELED)		\
+			fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n"); \
+		if (thread_support < MPI_THREAD_FUNNELED)		\
+			fprintf(stderr,"Warning: MPI does not have thread support!\n"); } while(0);
+

+ 2 - 2
mpi/tests/insert_task_compute.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -226,7 +226,7 @@ int main(int argc, char **argv)
 	int after_node[2][4] = {{220, 20, 11, 22}, {220, 20, 11, 22}};
 	int node, insert_task, data_array;
 
-	MPI_Init(&argc, &argv);
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 
 	global_ret = 0;

+ 0 - 0
mpi/tests/insert_task_count.c


Algúns arquivos non se mostraron porque demasiados arquivos cambiaron neste cambio