5 年之前 · 536fe80f62
--- a/ChangeLog
+++ b/ChangeLog
@@ -42,6 +42,7 @@ New features:
 
				   * Add a task prefetch level, to improve retaining data in accelerators so we
			
 
				     can make prefetch more aggressive.
			
 
				   * Add starpu_data_dup_ro().
			
 
				+  * Add starpu_data_release_to() and starpu_data_release_to_on_node().
			
 
				 
			
 
				 Small changes:
			
 
				   * Add a synthetic energy efficiency testcase.
			
@@ -51,6 +52,7 @@ StarPU 1.3.5 (git revision xxx)
 
				 
			
 
				 Small changes:
			
 
				   * Move MPI cache functions into the public API
			
 
				+  * Add STARPU_MPI_NOBIND environment variable.
			
 
				 
			
 
				 StarPU 1.3.4 (git revision c37a5d024cd997596da41f765557c58099baf896)
			
 
				 ====================================================================
			
--- a/configure.ac
+++ b/configure.ac
@@ -249,6 +249,8 @@ AC_ARG_WITH(simgrid-lib-dir,
 
				 	], [simgrid_lib_dir=no])
			
 
				 
			
 
				 if test x$enable_simgrid = xyes ; then
			
 
				+	PKG_CHECK_MODULES([SIMGRID], [simgrid])
			
 
				+
			
 
				    	if test -n "$SIMGRID_CFLAGS" ; then
			
 
				 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
			
 
				 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
			
@@ -836,7 +838,10 @@ if test x"$enable_native_winthreads" = xyes ; then
 
				 		AC_DEFINE(STARPU_NATIVE_WINTHREADS,[1],[Using native windows threads]),
			
 
				 		AC_MSG_ERROR([pthread_create unavailable]))
			
 
				 else
			
 
				-    AC_CHECK_LIB([pthread], [pthread_create])
			
 
				+    AC_CHECK_LIB([pthread], [pthread_create], [
			
 
				+        LIBS="$LIBS -lpthread"
			
 
				+        STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lpthread"
			
 
				+    ])
			
 
				 fi
			
 
				 
			
 
				 AC_SEARCH_LIBS([sqrt],[m],,AC_MSG_ERROR([math library unavailable]))
			
@@ -957,8 +962,8 @@ AC_CHECK_FUNCS([mkdtemp])
 
				 
			
 
				 AC_CHECK_FUNCS([pread pwrite])
			
 
				 
			
 
				-AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--disable-hdf5], [disable HDF5 support])],
			
 
				-                    enable_hdf5=$enableval, enable_hdf5=maybe)
			
 
				+AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--enable-hdf5], [disable HDF5 support])],
			
 
				+                    enable_hdf5=$enableval, enable_hdf5=no)
			
 
				 
			
 
				 if test "x$enable_hdf5" != xno ; then
			
 
				 	AC_ARG_WITH(hdf5-include-dir,
			
@@ -1017,8 +1022,11 @@ fi
 
				 
			
 
				 if test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno"; then
			
 
				         AC_DEFINE([STARPU_HAVE_HDF5], [1], [Define to 1 if you have the <hdf5.h> header file.])
			
 
				+	enable_hdf5=yes
			
 
				+else
			
 
				+	enable_hdf5=no
			
 
				 fi
			
 
				-AM_CONDITIONAL(STARPU_HAVE_HDF5, test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno")
			
 
				+AM_CONDITIONAL(STARPU_HAVE_HDF5, test "x$enable_hdf5" = "xyes")
			
 
				 
			
 
				 
			
 
				 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
			
@@ -2523,8 +2531,8 @@ AC_SUBST(STARPU_EXPORT_DYNAMIC)
 
				 # Computes the maximum number of different kernels a message-passing sink
			
 
				 # can lookup for and launch.
			
 
				 AC_MSG_CHECKING(Maximum number of message-passing kernels)
			
 
				-AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING([
			
 
				-	      -enable-maxmpkernels=<number>],
			
 
				+AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING(
			
 
				+	      [-enable-maxmpkernels=<number>],
			
 
				 	      [maximum number of kernels a message-passing sink can lookup
			
 
				 	      for and execute])],
			
 
				 	      maxmpkernels=$enableval, maxmpkernels=10)
			
@@ -3111,6 +3119,9 @@ fi
 
				 AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
			
 
				 			[Disable multiple linear regression models])],
			
 
				 			enable_mlr=$enableval, enable_mlr=$default_enable_mlr)
			
 
				+AC_ARG_ENABLE(mlr-system-blas, [AS_HELP_STRING([--enable-mlr-system-blas],
			
 
				+			[Make the multiple linear regression models use the system BLAS instead of min-dgels])],
			
 
				+			enable_mlr_blas=$enableval, enable_mlr_blas=no)
			
 
				 
			
 
				 AC_MSG_CHECKING(whether multiple linear regression models are disabled)
			
 
				 if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
			
@@ -3121,11 +3132,11 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 
				 	if test x$blas_lib = xnone ; then
			
 
				 	   use_system_lapack=no
			
 
				 	fi
			
 
				-	if test x$use_system_lapack = xyes; then
			
 
				+	if test x$enable_mlr_blas = xyes -a test x$use_system_lapack = xyes; then
			
 
				 	   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
			
 
				 		LDFLAGS="-llapack $LDFLAGS"
			
 
				 	else
			
 
				-		if test x$blas_lib = xmkl; then
			
 
				+		if test x$enable_mlr_blas=xyes -a test x$blas_lib = xmkl; then
			
 
				 		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use mkl library])
			
 
				 		else
			
 
				 			AC_MSG_CHECKING(whether min-dgels is linked)
			
@@ -3142,9 +3153,6 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 
				 					install_min_dgels=no
			
 
				 					support_mlr=no
			
 
				 				else
			
 
				-					if test ! -d $PWD/min-dgels; then
			
 
				-						cp -r $srcdir/min-dgels $PWD/
			
 
				-					fi
			
 
				 					AC_MSG_RESULT(yes)
			
 
				 					DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/minlibblas.a $STARPU_BUILD_DIR/min-dgels/build/minlibdgels.a $STARPU_BUILD_DIR/min-dgels/build/minlibf2c.a -Wl,--end-group"
			
 
				 					AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
			
@@ -3590,7 +3598,7 @@ if test "x$enable_shared" = xno; then
 
				         # No .so, so application will unexpected have to know which -l to
			
 
				         # use. Give them in .pc file.
			
 
				 	AC_DEFINE(STARPU_STATIC_ONLY, [1], [Only static compilation was made])
			
 
				-	STARPU_EXPORTED_LIBS="$LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
			
 
				+	STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
			
 
				 fi
			
 
				 AC_SUBST(STARPU_EXPORTED_LIBS)
			
 
				 
			
@@ -3631,6 +3639,7 @@ AC_CONFIG_COMMANDS([executable-scripts], [
 
				   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
			
 
				   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
			
 
				   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
			
 
				+  test -e tests/microbenchs/bandwidth_scheds.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/bandwidth_scheds.sh tests/microbenchs/
			
 
				   mkdir -p tests/energy
			
 
				   test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
			
 
				   test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
			
@@ -3806,6 +3815,7 @@ AC_MSG_NOTICE([
 
				                Scheduler Hypervisor:                          $build_sc_hypervisor
			
 
				                simgrid enabled:                               $enable_simgrid
			
 
				                ayudame enabled:                               $ayu_msg
			
 
				+               HDF5 enabled:                                  $enable_hdf5
			
 
				 	       Native fortran support:                        $enable_build_fortran
			
 
				 	       Native MPI fortran support:                    $use_mpi_fort
			
 
				 	       Support for multiple linear regression models: $support_mlr
			
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -352,7 +352,20 @@ use <c>export STARPU_CALIBRATE=1</c> (\ref STARPU_CALIBRATE). This may be necess
 
				 has not-so-stable performance. StarPU will force calibration (and thus ignore
			
 
				 the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
			
 
				 made on each architecture, to avoid bad scheduling decisions just because the
			
 
				-first measurements were not so good. Details on the current performance model status
			
 
				+first measurements were not so good.
			
 
				+
			
 
				+Note that StarPU will not record the very first measurement for a given codelet
			
 
				+and a given size, because it would most often be hit by computation library
			
 
				+loading or initialization. StarPU will also throw measurements away if it
			
 
				+notices that after computing an average execution time, it notices that most
			
 
				+subsequent tasks have an execution time largely outside the computed average
			
 
				+("Too big deviation for model..." warning messages). By looking at the details
			
 
				+of the message and their reported measurements, it can highlight that your
			
 
				+computation library really has non-stable measurements, which is probably an
			
 
				+indication of an issue in the computation library, or the execution environment
			
 
				+(e.g. rogue daemons).
			
 
				+
			
 
				+Details on the current performance model status
			
 
				 can be obtained with the tool <c>starpu_perfmodel_display</c>: the
			
 
				 option <c>-l</c> lists the available performance models, and the
			
 
				 option <c>-s</c> allows to choose the performance model to be
			
--- a/doc/doxygen/chapters/370_online_performance_tools.doxy
+++ b/doc/doxygen/chapters/370_online_performance_tools.doxy
@@ -375,7 +375,11 @@ parameter are stored in <c>.starpu/sampling/codelets/tmp/</c>
 
				 directory. These files are reused when \ref STARPU_CALIBRATE
			
 
				 environment variable is set to <c>1</c>, to recompute coefficients
			
 
				 based on the current, but also on the previous
			
 
				-executions. Additionally, when multiple linear regression models are
			
 
				+executions. By default StarPU uses a lightweight dgels implementation, but the
			
 
				+\ref enable-mlr-system-blas "--enable-mlr-system-blas" configure option can be
			
 
				+used to make StarPU use a system-provided dgels BLAS.
			
 
				+
			
 
				+Additionally, when multiple linear regression models are
			
 
				 disabled (using \ref disable-mlr "--disable-mlr" configure option) or when the
			
 
				 <c>model->combinations</c> are not defined, StarPU will still write
			
 
				 output files into <c>.starpu/sampling/codelets/tmp/</c> to allow
			
--- a/doc/doxygen/chapters/470_simgrid.doxy
+++ b/doc/doxygen/chapters/470_simgrid.doxy
@@ -167,8 +167,12 @@ theory results), see the \ref STARPU_SIMGRID_TRANSFER_COST, \ref STARPU_SIMGRID_
 
				 
			
 
				 \section SimulationMPIApplications MPI Applications
			
 
				 
			
 
				-StarPU-MPI applications can also be run in SimGrid mode. It needs to be compiled
			
 
				-with \c smpicc, and run using the <c>starpu_smpirun</c> script, for instance:
			
 
				+StarPU-MPI applications can also be run in SimGrid mode. smpi currently requires
			
 
				+that StarPU be build statically only, so <c>--disable-shared</c> needs to be
			
 
				+passed to <c>./configure</c>.
			
 
				+
			
 
				+The application needs to be compiled with \c smpicc, and run using the
			
 
				+<c>starpu_smpirun</c> script, for instance:
			
 
				 
			
 
				 \verbatim
			
 
				 $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mpi/tests/pingpong
			
@@ -182,6 +186,13 @@ in case of a heterogeneous platform, it is possible to use the
 
				 option <c>-hostfile-platform</c> in <c>starpu_smpirun</c>, that will define
			
 
				 \ref STARPU_MPI_HOSTNAMES with the hostnames of your hostfile.
			
 
				 
			
 
				+So as to use FxT traces, libfxt also needs to be built statically, <b>and</b>
			
 
				+with dynamic linking flags, i.e. with
			
 
				+
			
 
				+\verbatim
			
 
				+CFLAGS=-fPIC ./configure --enable-static
			
 
				+\endverbatim
			
 
				+
			
 
				 \section SimulationDebuggingApplications Debugging Applications
			
 
				 
			
 
				 By default, SimGrid uses its own implementation of threads, which prevents \c gdb
			
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -255,6 +255,14 @@ it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
 
				 workers.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_MPI_NOBIND</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_MPI_NOBIND
			
 
				+\addindex __env__STARPU_MPI_NOBIND
			
 
				+Setting it to non-zero will prevent StarPU from binding the MPI to
			
 
				+a separate core. This is for instance useful when running the testsuite on a single system.
			
 
				+</dd>
			
 
				+
			
 
				 <dt>STARPU_WORKERS_CUDAID</dt>
			
 
				 <dd>
			
 
				 \anchor STARPU_WORKERS_CUDAID
			
--- a/doc/doxygen/chapters/510_configure_options.doxy
+++ b/doc/doxygen/chapters/510_configure_options.doxy
@@ -571,11 +571,11 @@ Specify the blas library to be used by some of the examples. Librairies availabl
 
				 Enable linking with LevelDB if available
			
 
				 </dd>
			
 
				 
			
 
				-<dt>--disable-hdf5</dt>
			
 
				+<dt>--enable-hdf5</dt>
			
 
				 <dd>
			
 
				-\anchor disable-hdf5
			
 
				-\addindex __configure__--disable-hdf5
			
 
				-Disable building HDF5 support.
			
 
				+\anchor enable-hdf5
			
 
				+\addindex __configure__--enable-hdf5
			
 
				+Enable building HDF5 support.
			
 
				 </dd>
			
 
				 
			
 
				 <dt>--with-hdf5-include-dir=<c>path</c></dt>
			
@@ -768,6 +768,14 @@ this parameter is 10. Experimental.
 
				 Allow to disable multiple linear regression models (see \ref PerformanceModelExample)
			
 
				 </dd>
			
 
				 
			
 
				+<dt>--enable-mlr-system-blas</dt>
			
 
				+<dd>
			
 
				+\anchor enable-mlr-system-blas
			
 
				+\addindex __configure__--enable-mlr-system-blas
			
 
				+Allow to make multiple linear regression models use the system-provided BLAS for dgels
			
 
				+(see \ref PerformanceModelExample)
			
 
				+</dd>
			
 
				+
			
 
				 </dl>
			
 
				 
			
 
				 */
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -206,6 +206,7 @@ STARPU_EXAMPLES +=				\
 
				 	api/csr_data_interface			\
			
 
				 	api/matrix_data_interface		\
			
 
				 	api/multiformat_data_interface		\
			
 
				+	api/tensor_data_interface		\
			
 
				 	api/variable_data_interface		\
			
 
				 	api/vector_data_interface		\
			
 
				 	api/void_data_interface
			
--- a/examples/api/bcsr_data_interface.c
+++ b/examples/api/bcsr_data_interface.c
@@ -17,6 +17,17 @@
 
				 // This program checks that the implementation of the BCSR data
			
 
				 // interface only uses StarPU's public API
			
 
				 
			
 
				+#define starpu_interface_bcsr_ops my_starpu_interface_bcsr_ops
			
 
				+#define starpu_bcsr_data_register my_starpu_bcsr_data_register
			
 
				+#define starpu_bcsr_get_nnz my_starpu_bcsr_get_nnz
			
 
				+#define starpu_bcsr_get_nrow my_starpu_bcsr_get_nrow
			
 
				+#define starpu_bcsr_get_firstentry my_starpu_bcsr_get_firstentry
			
 
				+#define starpu_bcsr_get_r my_starpu_bcsr_get_r
			
 
				+#define starpu_bcsr_get_c my_starpu_bcsr_get_c
			
 
				+#define starpu_bcsr_get_elemsize my_starpu_bcsr_get_elemsize
			
 
				+#define starpu_bcsr_get_local_nzval my_starpu_bcsr_get_local_nzval
			
 
				+#define starpu_bcsr_get_local_colind my_starpu_bcsr_get_local_colind
			
 
				+#define starpu_bcsr_get_local_rowptr my_starpu_bcsr_get_local_rowptr
			
 
				 #include "../../src/datawizard/interfaces/bcsr_interface.c"
			
 
				 
			
 
				 int main()
			
--- a/examples/api/block_data_interface.c
+++ b/examples/api/block_data_interface.c
@@ -17,6 +17,16 @@
 
				 // This program checks that the implementation of the block data
			
 
				 // interface only uses StarPU's public API
			
 
				 
			
 
				+#define starpu_interface_block_ops my_starpu_interface_block_ops
			
 
				+#define starpu_block_data_register my_starpu_block_data_register
			
 
				+#define starpu_block_ptr_register my_starpu_block_ptr_register
			
 
				+#define starpu_block_get_nx my_starpu_block_get_nx
			
 
				+#define starpu_block_get_ny my_starpu_block_get_ny
			
 
				+#define starpu_block_get_nz my_starpu_block_get_nz
			
 
				+#define starpu_block_get_local_ldy my_starpu_block_get_local_ldy
			
 
				+#define starpu_block_get_local_ldz my_starpu_block_get_local_ldz
			
 
				+#define starpu_block_get_local_ptr my_starpu_block_get_local_ptr
			
 
				+#define starpu_block_get_elemsize my_starpu_block_get_elemsize
			
 
				 #include "../../src/datawizard/interfaces/block_interface.c"
			
 
				 
			
 
				 int main()
			
--- a/examples/api/coo_data_interface.c
+++ b/examples/api/coo_data_interface.c
@@ -17,6 +17,8 @@
 
				 // This program checks that the implementation of the COO data
			
 
				 // interface only uses StarPU's public API
			
 
				 
			
 
				+#define starpu_interface_coo_ops my_starpu_interface_coo_ops
			
 
				+#define starpu_coo_data_register my_starpu_coo_data_register
			
 
				 #include "../../src/datawizard/interfaces/coo_interface.c"
			
 
				 
			
 
				 int main()
			
--- a/examples/api/csr_data_interface.c
+++ b/examples/api/csr_data_interface.c
@@ -17,6 +17,15 @@
 
				 // This program checks that the implementation of the CSR data
			
 
				 // interface only uses StarPU's public API
			
 
				 
			
 
				+#define starpu_interface_csr_ops my_starpu_interface_csr_ops
			
 
				+#define starpu_csr_data_register my_starpu_csr_data_register
			
 
				+#define starpu_csr_get_nnz my_starpu_csr_get_nnz
			
 
				+#define starpu_csr_get_nrow my_starpu_csr_get_nrow
			
 
				+#define starpu_csr_get_firstentry my_starpu_csr_get_firstentry
			
 
				+#define starpu_csr_get_elemsize my_starpu_csr_get_elemsize
			
 
				+#define starpu_csr_get_local_nzval my_starpu_csr_get_local_nzval
			
 
				+#define starpu_csr_get_local_colind my_starpu_csr_get_local_colind
			
 
				+#define starpu_csr_get_local_rowptr my_starpu_csr_get_local_rowptr
			
 
				 #include "../../src/datawizard/interfaces/csr_interface.c"
			
 
				 
			
 
				 int main()
			
--- a/examples/api/matrix_data_interface.c
+++ b/examples/api/matrix_data_interface.c
@@ -17,6 +17,16 @@
 
				 // This program checks that the implementation of the matrix data
			
 
				 // interface only uses StarPU's public API
			
 
				 
			
 
				+#define starpu_interface_matrix_ops my_starpu_interface_matrix_ops
			
 
				+#define starpu_matrix_data_register my_starpu_matrix_data_register
			
 
				+#define starpu_matrix_data_register_allocsize my_starpu_matrix_data_register_allocsize
			
 
				+#define starpu_matrix_ptr_register my_starpu_matrix_data_ptr_register
			
 
				+#define starpu_matrix_get_nx my_starpu_matrix_get_nx
			
 
				+#define starpu_matrix_get_ny my_starpu_matrix_get_ny
			
 
				+#define starpu_matrix_get_local_ld my_starpu_matrix_get_local_ld
			
 
				+#define starpu_matrix_get_local_ptr my_starpu_matrix_get_local_ptr
			
 
				+#define starpu_matrix_get_elemsize my_starpu_matrix_get_elemsize
			
 
				+#define starpu_matrix_get_allocsize my_starpu_matrix_get_allocsize
			
 
				 #include "../../src/datawizard/interfaces/matrix_interface.c"
			
 
				 
			
 
				 int main()
			
--- a/examples/api/multiformat_data_interface.c
+++ b/examples/api/multiformat_data_interface.c
@@ -17,6 +17,8 @@
 
				 // This program checks that the implementation of the multiformat data
			
 
				 // interface only uses StarPU's public API
			
 
				 
			
 
				+#define starpu_interface_multiformat_ops my_starpu_interface_multiformat_ops
			
 
				+#define starpu_multiformat_data_register my_starpu_multiformat_data_register
			
 
				 #include "../../src/datawizard/interfaces/multiformat_interface.c"
			
 
				 
			
 
				 int main()
			
--- a/examples/api/tensor_data_interface.c
+++ b/examples/api/tensor_data_interface.c
@@ -0,0 +1,37 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+// This program checks that the implementation of the tensor data
			
 
				+// interface only uses StarPU's public API
			
 
				+
			
 
				+#define starpu_interface_tensor_ops my_starpu_interface_tensor_ops
			
 
				+#define starpu_tensor_data_register my_starpu_tensor_data_register
			
 
				+#define starpu_tensor_ptr_register my_starpu_tensor_data_ptr_register
			
 
				+#define starpu_tensor_get_nx my_starpu_tensor_get_nx
			
 
				+#define starpu_tensor_get_ny my_starpu_tensor_get_ny
			
 
				+#define starpu_tensor_get_nz my_starpu_tensor_get_nz
			
 
				+#define starpu_tensor_get_nt my_starpu_tensor_get_nt
			
 
				+#define starpu_tensor_get_local_ldy my_starpu_tensor_get_local_ldy
			
 
				+#define starpu_tensor_get_local_ldz my_starpu_tensor_get_local_ldz
			
 
				+#define starpu_tensor_get_local_ldt my_starpu_tensor_get_local_ldt
			
 
				+#define starpu_tensor_get_local_ptr my_starpu_tensor_get_local_ptr
			
 
				+#define starpu_tensor_get_elemsize my_starpu_tensor_get_elemsize
			
 
				+#include "../../src/datawizard/interfaces/tensor_interface.c"
			
 
				+
			
 
				+int main()
			
 
				+{
			
 
				+        return 0;
			
 
				+}
			
--- a/examples/api/variable_data_interface.c
+++ b/examples/api/variable_data_interface.c
@@ -17,6 +17,11 @@
 
				 // This program checks that the implementation of the variable data
			
 
				 // interface only uses StarPU's public API
			
 
				 
			
 
				+#define starpu_interface_variable_ops my_starpu_interface_variable_ops
			
 
				+#define starpu_variable_data_register my_starpu_variable_data_register
			
 
				+#define starpu_variable_ptr_register my_starpu_variable_ptr_register
			
 
				+#define starpu_variable_get_local_ptr my_starpu_variable_get_local_ptr
			
 
				+#define starpu_variable_get_elemsize my_starpu_variable_get_elemsize
			
 
				 #include "../../src/datawizard/interfaces/variable_interface.c"
			
 
				 
			
 
				 int main()
			
--- a/examples/api/vector_data_interface.c
+++ b/examples/api/vector_data_interface.c
@@ -17,6 +17,14 @@
 
				 // This program checks that the implementation of the vector data
			
 
				 // interface only uses StarPU's public API
			
 
				 
			
 
				+#define starpu_interface_vector_ops my_starpu_interface_vector_ops
			
 
				+#define starpu_vector_data_register my_starpu_vector_data_register
			
 
				+#define starpu_vector_data_register_allocsize my_starpu_vector_data_register_allocsize
			
 
				+#define starpu_vector_ptr_register my_starpu_vector_data_ptr_register
			
 
				+#define starpu_vector_get_nx my_starpu_vector_get_nx
			
 
				+#define starpu_vector_get_local_ptr my_starpu_vector_get_local_ptr
			
 
				+#define starpu_vector_get_elemsize my_starpu_vector_get_elemsize
			
 
				+#define starpu_vector_get_allocsize my_starpu_vector_get_allocsize
			
 
				 #include "../../src/datawizard/interfaces/vector_interface.c"
			
 
				 
			
 
				 int main()
			
--- a/examples/api/void_data_interface.c
+++ b/examples/api/void_data_interface.c
@@ -17,6 +17,8 @@
 
				 // This program checks that the implementation of the void data
			
 
				 // interface only uses StarPU's public API
			
 
				 
			
 
				+#define starpu_interface_void_ops my_starpu_interface_void_ops
			
 
				+#define starpu_void_data_register my_starpu_void_data_register
			
 
				 #include "../../src/datawizard/interfaces/void_interface.c"
			
 
				 
			
 
				 int main()
			
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -92,29 +92,26 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 		}
			
 
				 		starpu_data_wont_use(sdatakk);
			
 
				 
			
 
				-		for (m = k+1; m<nblocks; m++)
			
 
				+		for (n = k+1; n<nblocks; n++)
			
 
				 		{
			
 
				-                        starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
			
 
				-			for (n = k+1; n<nblocks; n++)
			
 
				+                        starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
			
 
				+			for (m = n; m<nblocks; m++)
			
 
				 			{
			
 
				-				if (n <= m)
			
 
				-                                {
			
 
				-					starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
			
 
				-					starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
			
 
				-
			
 
				-					ret = starpu_task_insert(&cl22,
			
 
				-								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				-								 STARPU_R, sdatamk,
			
 
				-								 STARPU_R, sdatank,
			
 
				-								 cl22.modes[2], sdatamn,
			
 
				-								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
			
 
				-								 STARPU_TAG_ONLY, TAG22(k,m,n),
			
 
				-								 0);
			
 
				-					if (ret == -ENODEV) return 77;
			
 
				-					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				-                                }
			
 
				+				starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
			
 
				+				starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
			
 
				+
			
 
				+				ret = starpu_task_insert(&cl22,
			
 
				+							 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				+							 STARPU_R, sdatamk,
			
 
				+							 STARPU_R, sdatank,
			
 
				+							 cl22.modes[2], sdatamn,
			
 
				+							 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
			
 
				+							 STARPU_TAG_ONLY, TAG22(k,m,n),
			
 
				+							 0);
			
 
				+				if (ret == -ENODEV) return 77;
			
 
				+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				 			}
			
 
				-			starpu_data_wont_use(sdatamk);
			
 
				+			starpu_data_wont_use(sdatank);
			
 
				 		}
			
 
				 		starpu_iteration_pop();
			
 
				 	}
			
--- a/examples/filters/fblock.c
+++ b/examples/filters/fblock.c
@@ -169,6 +169,11 @@ int main(void)
 
				         print_data(handle);
			
 
				         starpu_data_unregister(handle);
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        ret = starpu_opencl_unload_opencl(&opencl_program);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
			
 
				+#endif
			
 
				+
			
 
				         /* Print result block */
			
 
				         FPRINTF(stderr, "OUT Block\n");
			
 
				         print_block(block, NX, NY, NZ, NX, NX*NY);
			
--- a/examples/filters/fblock_opencl.c
+++ b/examples/filters/fblock_opencl.c
@@ -60,8 +60,8 @@ void opencl_func(void *buffers[], void *cl_arg)
 
				 	CHECK_CL_SET_KERNEL_ARG(kernel, 7, sizeof(*factor), factor);
			
 
				 
			
 
				 	{
			
 
				-		size_t global=nx*ny*nz;
			
 
				-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
			
 
				+		size_t global[3]={nx,ny,nz};
			
 
				+		err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 	}
			
 
				 	starpu_opencl_release_kernel(kernel);
			
--- a/examples/filters/fblock_opencl_kernel.cl
+++ b/examples/filters/fblock_opencl_kernel.cl
@@ -18,14 +18,17 @@
 
				 
			
 
				 __kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
			
 
				 {
			
 
				-        int i, j, k;
			
 
				-        block = (__global char *)block + offset;
			
 
				-        for(k=0; k<nz ; k++)
			
 
				-	{
			
 
				-                for(j=0; j<ny ; j++)
			
 
				-		{
			
 
				-                        for(i=0; i<nx ; i++)
			
 
				-                                block[(k*ldz)+(j*ldy)+i] = factor;
			
 
				-                }
			
 
				-        }
			
 
				+	const int idx = get_global_id(0);
			
 
				+	const int idy = get_global_id(1);
			
 
				+	const int idz = get_global_id(2);
			
 
				+	if (idx >= nx)
			
 
				+		return;
			
 
				+	if (idy >= ny)
			
 
				+		return;
			
 
				+	if (idz >= nz)
			
 
				+		return;
			
 
				+
			
 
				+	block = (__global int*) ((__global char *)block + offset);
			
 
				+	int i = idz*ldz + idy*ldy + idx;
			
 
				+	block[i] = factor;
			
 
				 }
			
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -350,12 +350,30 @@ void starpu_data_release(starpu_data_handle_t handle);
 
				 
			
 
				 /**
			
 
				    Similar to starpu_data_release(), except that the data
			
 
				-   will be available on the given memory \p node instead of main memory.
			
 
				+   was made available on the given memory \p node instead of main memory.
			
 
				    The \p node parameter must be exactly the same as the corresponding \c
			
 
				    starpu_data_acquire_on_node* call.
			
 
				 */
			
 
				 void starpu_data_release_on_node(starpu_data_handle_t handle, int node);
			
 
				 
			
 
				+/**
			
 
				+   Partly release the piece of data acquired by the application either by
			
 
				+   starpu_data_acquire() or by starpu_data_acquire_cb(), switching the
			
 
				+   acquisition down to \p down_to_mode. For now, only releasing from STARPU_RW
			
 
				+   or STARPU_W acquisition down to STARPU_R is supported, or down to the same
			
 
				+   acquisition.  STARPU_NONE can also be passed as \p down_to_mode, in which
			
 
				+   case this is equivalent to calling starpu_data_release().
			
 
				+*/
			
 
				+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
			
 
				+
			
 
				+/**
			
 
				+   Similar to starpu_data_release_to(), except that the data
			
 
				+   was made available on the given memory \p node instead of main memory.
			
 
				+   The \p node parameter must be exactly the same as the corresponding \c
			
 
				+   starpu_data_acquire_on_node* call.
			
 
				+*/
			
 
				+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode, int node);
			
 
				+
			
 
				 /** @} */
			
 
				 
			
 
				 /**
			
--- a/include/starpu_fxt.h
+++ b/include/starpu_fxt.h
@@ -69,42 +69,36 @@ struct starpu_fxt_options
 
				 	char *number_events_path;
			
 
				 	char *anim_path;
			
 
				 	char *states_path;
			
 
				+	char worker_names[STARPU_NMAXWORKERS][256];
			
 
				+	int nworkers;
			
 
				+	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
			
 
				 
			
 
				 	/**
			
 
				 	   In case we are going to gather multiple traces (e.g in the case of
			
 
				 	   MPI processes), we may need to prefix the name of the containers.
			
 
				 	*/
			
 
				 	char *file_prefix;
			
 
				+
			
 
				 	/**
			
 
				 	   In case we are going to gather multiple traces (e.g in the case of
			
 
				-	   MPI processes), we may need to prefix the name of the containers.
			
 
				+	   MPI processes), this variable stores the time offset with the rank 0.
			
 
				 	*/
			
 
				 	uint64_t file_offset;
			
 
				+
			
 
				 	/**
			
 
				 	   In case we are going to gather multiple traces (e.g in the case of
			
 
				-	   MPI processes), we may need to prefix the name of the containers.
			
 
				+	   MPI processes), this variable stores the MPI rank of the trace file.
			
 
				 	*/
			
 
				 	int file_rank;
			
 
				 
			
 
				 	/**
			
 
				-	   Output parameters
			
 
				-	*/
			
 
				-	char worker_names[STARPU_NMAXWORKERS][256];
			
 
				-	/**
			
 
				-	   Output parameters
			
 
				-	*/
			
 
				-	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
			
 
				-	/**
			
 
				-	   Output parameters
			
 
				-	*/
			
 
				-	int nworkers;
			
 
				-
			
 
				-	/**
			
 
				 	   In case we want to dump the list of codelets to an external tool
			
 
				 	*/
			
 
				 	struct starpu_fxt_codelet_event **dumped_codelets;
			
 
				+
			
 
				 	/**
			
 
				-	   In case we want to dump the list of codelets to an external tool
			
 
				+	   In case we want to dump the list of codelets to an external tool, number
			
 
				+	   of dumped codelets.
			
 
				 	*/
			
 
				 	long dumped_codelets_count;
			
 
				 };
			
--- a/min-dgels/Makefile.in
+++ b/min-dgels/Makefile.in
@@ -7,13 +7,17 @@ ADDITIONAL=additional
 
				 
			
 
				 all:
			
 
				 	mkdir -p build
			
 
				-	[ -d "$(CLAPACK)" ] || cp -a $(srcdir)/$(CLAPACK) .
			
 
				+	[ -d "$(CLAPACK)" ] || ( cp -a $(srcdir)/$(CLAPACK) . ; chmod -R +rwX $(CLAPACK) )
			
 
				 	cd $(CLAPACK) && $(MAKE) blaslib CC="$(CC)" LD="$(LD)"
			
 
				 	cd $(CLAPACK) && $(MAKE) f2clib CC="$(CC)" LD="$(LD)"
			
 
				-	[ -d "$(ADDITIONAL)" ] || cp -a $(srcdir)/$(ADDITIONAL) .
			
 
				+	[ -d "$(ADDITIONAL)" ] || ( cp -a $(srcdir)/$(ADDITIONAL) . ; chmod -R +rwX $(ADDITIONAL) )
			
 
				 	cd $(ADDITIONAL) && $(CC) -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
			
 
				 
			
 
				 install:
			
 
				+installcheck:
			
 
				+uninstall:
			
 
				+distuninstallcheck:
			
 
				+dvi:
			
 
				 
			
 
				 clean:
			
 
				 	-cd $(CLAPACK) && $(MAKE) clean && rm -rf *~
			
@@ -21,6 +25,7 @@ clean:
 
				 	rm -rf build *~
			
 
				 
			
 
				 distclean: clean
			
 
				+	[ -f Makefile.in ] || rm -fr $(CLAPACK) $(ADDITIONAL)
			
 
				 
			
 
				 # This part is needed by StarPU
			
 
				 
			
--- a/min-dgels/additional/blaswrap.h
+++ b/min-dgels/additional/blaswrap.h
@@ -5,156 +5,4 @@
 
				 #ifndef __BLASWRAP_H
			
 
				 #define __BLASWRAP_H
			
 
				 
			
 
				-#ifndef NO_BLAS_WRAP
			
 
				- 
			
 
				-/* BLAS1 routines */
			
 
				-#define _starpu_srotg_ f2c_srotg
			
 
				-#define _starpu_crotg_ f2c_crotg
			
 
				-#define _starpu_drotg_ f2c_drotg
			
 
				-#define _starpu_zrotg_ f2c_zrotg
			
 
				-#define _starpu_srotmg_ f2c_srotmg
			
 
				-#define _starpu_drotmg_ f2c_drotmg
			
 
				-#define _starpu_srot_ f2c_srot
			
 
				-#define _starpu_drot_ f2c_drot
			
 
				-#define _starpu_srotm_ f2c_srotm
			
 
				-#define _starpu_drotm_ f2c_drotm
			
 
				-#define _starpu_sswap_ f2c_sswap
			
 
				-#define _starpu_dswap_ f2c_dswap
			
 
				-#define _starpu_cswap_ f2c_cswap
			
 
				-#define _starpu_zswap_ f2c_zswap
			
 
				-#define _starpu_sscal_ f2c_sscal
			
 
				-#define _starpu_dscal_ f2c_dscal
			
 
				-#define _starpu_cscal_ f2c_cscal
			
 
				-#define _starpu_zscal_ f2c_zscal
			
 
				-#define _starpu_csscal_ f2c_csscal
			
 
				-#define _starpu_zdscal_ f2c_zdscal
			
 
				-#define _starpu_scopy_ f2c_scopy
			
 
				-#define _starpu_dcopy_ f2c_dcopy
			
 
				-#define _starpu_ccopy_ f2c_ccopy
			
 
				-#define _starpu_zcopy_ f2c_zcopy
			
 
				-#define _starpu_saxpy_ f2c_saxpy
			
 
				-#define _starpu_daxpy_ f2c_daxpy
			
 
				-#define _starpu_caxpy_ f2c_caxpy
			
 
				-#define _starpu_zaxpy_ f2c_zaxpy
			
 
				-#define _starpu_sdot_ f2c_sdot
			
 
				-#define _starpu_ddot_ f2c_ddot
			
 
				-#define _starpu_cdotu_ f2c_cdotu
			
 
				-#define _starpu_zdotu_ f2c_zdotu
			
 
				-#define _starpu_cdotc_ f2c_cdotc
			
 
				-#define _starpu_zdotc_ f2c_zdotc
			
 
				-#define _starpu_snrm2_ f2c_snrm2
			
 
				-#define _starpu_dnrm2_ f2c_dnrm2
			
 
				-#define _starpu_scnrm2_ f2c_scnrm2
			
 
				-#define _starpu_dznrm2_ f2c_dznrm2
			
 
				-#define _starpu_sasum_ f2c_sasum
			
 
				-#define _starpu_dasum_ f2c_dasum
			
 
				-#define _starpu_scasum_ f2c_scasum
			
 
				-#define _starpu_dzasum_ f2c_dzasum
			
 
				-#define _starpu_isamax_ f2c_isamax
			
 
				-#define _starpu_idamax_ f2c_idamax
			
 
				-#define _starpu_icamax_ f2c_icamax
			
 
				-#define _starpu_izamax_ f2c_izamax
			
 
				- 
			
 
				-/* BLAS2 routines */
			
 
				-#define _starpu_sgemv_ f2c_sgemv
			
 
				-#define _starpu_dgemv_ f2c_dgemv
			
 
				-#define _starpu_cgemv_ f2c_cgemv
			
 
				-#define _starpu_zgemv_ f2c_zgemv
			
 
				-#define _starpu_sgbmv_ f2c_sgbmv
			
 
				-#define _starpu_dgbmv_ f2c_dgbmv
			
 
				-#define _starpu_cgbmv_ f2c_cgbmv
			
 
				-#define _starpu_zgbmv_ f2c_zgbmv
			
 
				-#define _starpu_chemv_ f2c_chemv
			
 
				-#define _starpu_zhemv_ f2c_zhemv
			
 
				-#define _starpu_chbmv_ f2c_chbmv
			
 
				-#define _starpu_zhbmv_ f2c_zhbmv
			
 
				-#define _starpu_chpmv_ f2c_chpmv
			
 
				-#define _starpu_zhpmv_ f2c_zhpmv
			
 
				-#define _starpu_ssymv_ f2c_ssymv
			
 
				-#define _starpu_dsymv_ f2c_dsymv
			
 
				-#define _starpu_ssbmv_ f2c_ssbmv
			
 
				-#define _starpu_dsbmv_ f2c_dsbmv
			
 
				-#define _starpu_sspmv_ f2c_sspmv
			
 
				-#define _starpu_dspmv_ f2c_dspmv
			
 
				-#define _starpu_strmv_ f2c_strmv
			
 
				-#define _starpu_dtrmv_ f2c_dtrmv
			
 
				-#define _starpu_ctrmv_ f2c_ctrmv
			
 
				-#define _starpu_ztrmv_ f2c_ztrmv
			
 
				-#define _starpu_stbmv_ f2c_stbmv
			
 
				-#define _starpu_dtbmv_ f2c_dtbmv
			
 
				-#define _starpu_ctbmv_ f2c_ctbmv
			
 
				-#define _starpu_ztbmv_ f2c_ztbmv
			
 
				-#define _starpu_stpmv_ f2c_stpmv
			
 
				-#define _starpu_dtpmv_ f2c_dtpmv
			
 
				-#define _starpu_ctpmv_ f2c_ctpmv
			
 
				-#define _starpu_ztpmv_ f2c_ztpmv
			
 
				-#define _starpu_strsv_ f2c_strsv
			
 
				-#define _starpu_dtrsv_ f2c_dtrsv
			
 
				-#define _starpu_ctrsv_ f2c_ctrsv
			
 
				-#define _starpu_ztrsv_ f2c_ztrsv
			
 
				-#define _starpu_stbsv_ f2c_stbsv
			
 
				-#define _starpu_dtbsv_ f2c_dtbsv
			
 
				-#define _starpu_ctbsv_ f2c_ctbsv
			
 
				-#define _starpu_ztbsv_ f2c_ztbsv
			
 
				-#define _starpu_stpsv_ f2c_stpsv
			
 
				-#define _starpu_dtpsv_ f2c_dtpsv
			
 
				-#define _starpu_ctpsv_ f2c_ctpsv
			
 
				-#define _starpu_ztpsv_ f2c_ztpsv
			
 
				-#define _starpu_sger_ f2c_sger
			
 
				-#define _starpu_dger_ f2c_dger
			
 
				-#define _starpu_cgeru_ f2c_cgeru
			
 
				-#define _starpu_zgeru_ f2c_zgeru
			
 
				-#define _starpu_cgerc_ f2c_cgerc
			
 
				-#define _starpu_zgerc_ f2c_zgerc
			
 
				-#define _starpu_cher_ f2c_cher
			
 
				-#define _starpu_zher_ f2c_zher
			
 
				-#define _starpu_chpr_ f2c_chpr
			
 
				-#define _starpu_zhpr_ f2c_zhpr
			
 
				-#define _starpu_cher2_ f2c_cher2
			
 
				-#define _starpu_zher2_ f2c_zher2
			
 
				-#define _starpu_chpr2_ f2c_chpr2
			
 
				-#define _starpu_zhpr2_ f2c_zhpr2
			
 
				-#define _starpu_ssyr_ f2c_ssyr
			
 
				-#define _starpu_dsyr_ f2c_dsyr
			
 
				-#define _starpu_sspr_ f2c_sspr
			
 
				-#define _starpu_dspr_ f2c_dspr
			
 
				-#define _starpu_ssyr2_ f2c_ssyr2
			
 
				-#define _starpu_dsyr2_ f2c_dsyr2
			
 
				-#define _starpu_sspr2_ f2c_sspr2
			
 
				-#define _starpu_dspr2_ f2c_dspr2
			
 
				- 
			
 
				-/* BLAS3 routines */
			
 
				-#define _starpu_sgemm_ f2c_sgemm
			
 
				-#define _starpu_dgemm_ f2c_dgemm
			
 
				-#define _starpu_cgemm_ f2c_cgemm
			
 
				-#define _starpu_zgemm_ f2c_zgemm
			
 
				-#define _starpu_ssymm_ f2c_ssymm
			
 
				-#define _starpu_dsymm_ f2c_dsymm
			
 
				-#define _starpu_csymm_ f2c_csymm
			
 
				-#define _starpu_zsymm_ f2c_zsymm
			
 
				-#define _starpu_chemm_ f2c_chemm
			
 
				-#define _starpu_zhemm_ f2c_zhemm
			
 
				-#define _starpu_ssyrk_ f2c_ssyrk
			
 
				-#define _starpu_dsyrk_ f2c_dsyrk
			
 
				-#define _starpu_csyrk_ f2c_csyrk
			
 
				-#define _starpu_zsyrk_ f2c_zsyrk
			
 
				-#define _starpu_cherk_ f2c_cherk
			
 
				-#define _starpu_zherk_ f2c_zherk
			
 
				-#define _starpu_ssyr2k_ f2c_ssyr2k
			
 
				-#define _starpu_dsyr2k_ f2c_dsyr2k
			
 
				-#define _starpu_csyr2k_ f2c_csyr2k
			
 
				-#define _starpu_zsyr2k_ f2c_zsyr2k
			
 
				-#define _starpu_cher2k_ f2c_cher2k
			
 
				-#define _starpu_zher2k_ f2c_zher2k
			
 
				-#define _starpu_strmm_ f2c_strmm
			
 
				-#define _starpu_dtrmm_ f2c_dtrmm
			
 
				-#define _starpu_ctrmm_ f2c_ctrmm
			
 
				-#define _starpu_ztrmm_ f2c_ztrmm
			
 
				-#define _starpu_strsm_ f2c_strsm
			
 
				-#define _starpu_dtrsm_ f2c_dtrsm
			
 
				-#define _starpu_ctrsm_ f2c_ctrsm
			
 
				-#define _starpu_ztrsm_ f2c_ztrsm
			
 
				-
			
 
				-#endif /* NO_BLAS_WRAP */
			
 
				-
			
 
				 #endif /* __BLASWRAP_H */
			
--- a/min-dgels/base/F2CLIBS/libf2c/Makefile
+++ b/min-dgels/base/F2CLIBS/libf2c/Makefile
@@ -175,6 +175,14 @@ xwsne.o:	fio.h
 
				 xwsne.o:	lio.h
			
 
				 xwsne.o:	fmt.h
			
 
				 
			
 
				+main.o:		signal1.h
			
 
				+signal_.o:	signal1.h
			
 
				+s_paus.o:	signal1.h
			
 
				+
			
 
				+err.o:		sysdep1.h
			
 
				+fio.h:		sysdep1.h
			
 
				+util.c:		sysdep1.h
			
 
				+
			
 
				 arith.h: arithchk.c
			
 
				 	$(CC) $(CFLAGS) -DNO_FPINIT arithchk.c -lm ||\
			
 
				 	 $(CC) -DNO_LONG_LONG $(CFLAGS) -DNO_FPINIT arithchk.c -lm
			
--- a/min-dgels/base/INCLUDE/blaswrap.h
+++ b/min-dgels/base/INCLUDE/blaswrap.h
@@ -5,156 +5,4 @@
 
				 #ifndef __BLASWRAP_H
			
 
				 #define __BLASWRAP_H
			
 
				 
			
 
				-#ifndef NO_BLAS_WRAP
			
 
				- 
			
 
				-/* BLAS1 routines */
			
 
				-#define _starpu_srotg_ f2c_srotg
			
 
				-#define _starpu_crotg_ f2c_crotg
			
 
				-#define _starpu_drotg_ f2c_drotg
			
 
				-#define _starpu_zrotg_ f2c_zrotg
			
 
				-#define _starpu_srotmg_ f2c_srotmg
			
 
				-#define _starpu_drotmg_ f2c_drotmg
			
 
				-#define _starpu_srot_ f2c_srot
			
 
				-#define _starpu_drot_ f2c_drot
			
 
				-#define _starpu_srotm_ f2c_srotm
			
 
				-#define _starpu_drotm_ f2c_drotm
			
 
				-#define _starpu_sswap_ f2c_sswap
			
 
				-#define _starpu_dswap_ f2c_dswap
			
 
				-#define _starpu_cswap_ f2c_cswap
			
 
				-#define _starpu_zswap_ f2c_zswap
			
 
				-#define _starpu_sscal_ f2c_sscal
			
 
				-#define _starpu_dscal_ f2c_dscal
			
 
				-#define _starpu_cscal_ f2c_cscal
			
 
				-#define _starpu_zscal_ f2c_zscal
			
 
				-#define _starpu_csscal_ f2c_csscal
			
 
				-#define _starpu_zdscal_ f2c_zdscal
			
 
				-#define _starpu_scopy_ f2c_scopy
			
 
				-#define _starpu_dcopy_ f2c_dcopy
			
 
				-#define _starpu_ccopy_ f2c_ccopy
			
 
				-#define _starpu_zcopy_ f2c_zcopy
			
 
				-#define _starpu_saxpy_ f2c_saxpy
			
 
				-#define _starpu_daxpy_ f2c_daxpy
			
 
				-#define _starpu_caxpy_ f2c_caxpy
			
 
				-#define _starpu_zaxpy_ f2c_zaxpy
			
 
				-#define _starpu_sdot_ f2c_sdot
			
 
				-#define _starpu_ddot_ f2c_ddot
			
 
				-#define _starpu_cdotu_ f2c_cdotu
			
 
				-#define _starpu_zdotu_ f2c_zdotu
			
 
				-#define _starpu_cdotc_ f2c_cdotc
			
 
				-#define _starpu_zdotc_ f2c_zdotc
			
 
				-#define _starpu_snrm2_ f2c_snrm2
			
 
				-#define _starpu_dnrm2_ f2c_dnrm2
			
 
				-#define _starpu_scnrm2_ f2c_scnrm2
			
 
				-#define _starpu_dznrm2_ f2c_dznrm2
			
 
				-#define _starpu_sasum_ f2c_sasum
			
 
				-#define _starpu_dasum_ f2c_dasum
			
 
				-#define _starpu_scasum_ f2c_scasum
			
 
				-#define _starpu_dzasum_ f2c_dzasum
			
 
				-#define _starpu_isamax_ f2c_isamax
			
 
				-#define _starpu_idamax_ f2c_idamax
			
 
				-#define _starpu_icamax_ f2c_icamax
			
 
				-#define _starpu_izamax_ f2c_izamax
			
 
				- 
			
 
				-/* BLAS2 routines */
			
 
				-#define _starpu_sgemv_ f2c_sgemv
			
 
				-#define _starpu_dgemv_ f2c_dgemv
			
 
				-#define _starpu_cgemv_ f2c_cgemv
			
 
				-#define _starpu_zgemv_ f2c_zgemv
			
 
				-#define _starpu_sgbmv_ f2c_sgbmv
			
 
				-#define _starpu_dgbmv_ f2c_dgbmv
			
 
				-#define _starpu_cgbmv_ f2c_cgbmv
			
 
				-#define _starpu_zgbmv_ f2c_zgbmv
			
 
				-#define _starpu_chemv_ f2c_chemv
			
 
				-#define _starpu_zhemv_ f2c_zhemv
			
 
				-#define _starpu_chbmv_ f2c_chbmv
			
 
				-#define _starpu_zhbmv_ f2c_zhbmv
			
 
				-#define _starpu_chpmv_ f2c_chpmv
			
 
				-#define _starpu_zhpmv_ f2c_zhpmv
			
 
				-#define _starpu_ssymv_ f2c_ssymv
			
 
				-#define _starpu_dsymv_ f2c_dsymv
			
 
				-#define _starpu_ssbmv_ f2c_ssbmv
			
 
				-#define _starpu_dsbmv_ f2c_dsbmv
			
 
				-#define _starpu_sspmv_ f2c_sspmv
			
 
				-#define _starpu_dspmv_ f2c_dspmv
			
 
				-#define _starpu_strmv_ f2c_strmv
			
 
				-#define _starpu_dtrmv_ f2c_dtrmv
			
 
				-#define _starpu_ctrmv_ f2c_ctrmv
			
 
				-#define _starpu_ztrmv_ f2c_ztrmv
			
 
				-#define _starpu_stbmv_ f2c_stbmv
			
 
				-#define _starpu_dtbmv_ f2c_dtbmv
			
 
				-#define _starpu_ctbmv_ f2c_ctbmv
			
 
				-#define _starpu_ztbmv_ f2c_ztbmv
			
 
				-#define _starpu_stpmv_ f2c_stpmv
			
 
				-#define _starpu_dtpmv_ f2c_dtpmv
			
 
				-#define _starpu_ctpmv_ f2c_ctpmv
			
 
				-#define _starpu_ztpmv_ f2c_ztpmv
			
 
				-#define _starpu_strsv_ f2c_strsv
			
 
				-#define _starpu_dtrsv_ f2c_dtrsv
			
 
				-#define _starpu_ctrsv_ f2c_ctrsv
			
 
				-#define _starpu_ztrsv_ f2c_ztrsv
			
 
				-#define _starpu_stbsv_ f2c_stbsv
			
 
				-#define _starpu_dtbsv_ f2c_dtbsv
			
 
				-#define _starpu_ctbsv_ f2c_ctbsv
			
 
				-#define _starpu_ztbsv_ f2c_ztbsv
			
 
				-#define _starpu_stpsv_ f2c_stpsv
			
 
				-#define _starpu_dtpsv_ f2c_dtpsv
			
 
				-#define _starpu_ctpsv_ f2c_ctpsv
			
 
				-#define _starpu_ztpsv_ f2c_ztpsv
			
 
				-#define _starpu_sger_ f2c_sger
			
 
				-#define _starpu_dger_ f2c_dger
			
 
				-#define _starpu_cgeru_ f2c_cgeru
			
 
				-#define _starpu_zgeru_ f2c_zgeru
			
 
				-#define _starpu_cgerc_ f2c_cgerc
			
 
				-#define _starpu_zgerc_ f2c_zgerc
			
 
				-#define _starpu_cher_ f2c_cher
			
 
				-#define _starpu_zher_ f2c_zher
			
 
				-#define _starpu_chpr_ f2c_chpr
			
 
				-#define _starpu_zhpr_ f2c_zhpr
			
 
				-#define _starpu_cher2_ f2c_cher2
			
 
				-#define _starpu_zher2_ f2c_zher2
			
 
				-#define _starpu_chpr2_ f2c_chpr2
			
 
				-#define _starpu_zhpr2_ f2c_zhpr2
			
 
				-#define _starpu_ssyr_ f2c_ssyr
			
 
				-#define _starpu_dsyr_ f2c_dsyr
			
 
				-#define _starpu_sspr_ f2c_sspr
			
 
				-#define _starpu_dspr_ f2c_dspr
			
 
				-#define _starpu_ssyr2_ f2c_ssyr2
			
 
				-#define _starpu_dsyr2_ f2c_dsyr2
			
 
				-#define _starpu_sspr2_ f2c_sspr2
			
 
				-#define _starpu_dspr2_ f2c_dspr2
			
 
				- 
			
 
				-/* BLAS3 routines */
			
 
				-#define _starpu_sgemm_ f2c_sgemm
			
 
				-#define _starpu_dgemm_ f2c_dgemm
			
 
				-#define _starpu_cgemm_ f2c_cgemm
			
 
				-#define _starpu_zgemm_ f2c_zgemm
			
 
				-#define _starpu_ssymm_ f2c_ssymm
			
 
				-#define _starpu_dsymm_ f2c_dsymm
			
 
				-#define _starpu_csymm_ f2c_csymm
			
 
				-#define _starpu_zsymm_ f2c_zsymm
			
 
				-#define _starpu_chemm_ f2c_chemm
			
 
				-#define _starpu_zhemm_ f2c_zhemm
			
 
				-#define _starpu_ssyrk_ f2c_ssyrk
			
 
				-#define _starpu_dsyrk_ f2c_dsyrk
			
 
				-#define _starpu_csyrk_ f2c_csyrk
			
 
				-#define _starpu_zsyrk_ f2c_zsyrk
			
 
				-#define _starpu_cherk_ f2c_cherk
			
 
				-#define _starpu_zherk_ f2c_zherk
			
 
				-#define _starpu_ssyr2k_ f2c_ssyr2k
			
 
				-#define _starpu_dsyr2k_ f2c_dsyr2k
			
 
				-#define _starpu_csyr2k_ f2c_csyr2k
			
 
				-#define _starpu_zsyr2k_ f2c_zsyr2k
			
 
				-#define _starpu_cher2k_ f2c_cher2k
			
 
				-#define _starpu_zher2k_ f2c_zher2k
			
 
				-#define _starpu_strmm_ f2c_strmm
			
 
				-#define _starpu_dtrmm_ f2c_dtrmm
			
 
				-#define _starpu_ctrmm_ f2c_ctrmm
			
 
				-#define _starpu_ztrmm_ f2c_ztrmm
			
 
				-#define _starpu_strsm_ f2c_strsm
			
 
				-#define _starpu_dtrsm_ f2c_dtrsm
			
 
				-#define _starpu_ctrsm_ f2c_ctrsm
			
 
				-#define _starpu_ztrsm_ f2c_ztrsm
			
 
				-
			
 
				-#endif /* NO_BLAS_WRAP */
			
 
				-
			
 
				 #endif /* __BLASWRAP_H */
			
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -47,10 +47,10 @@ endif
 
				 endif
			
 
				 
			
 
				 if STARPU_HAVE_AM111
			
 
				-TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
 
				+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
 
				 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
			
 
				 else
			
 
				-TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
			
 
				+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
			
 
				 endif
			
 
				 
			
 
				 if STARPU_MPI_CHECK
			
@@ -155,7 +155,9 @@ examplebin_PROGRAMS += 			\
 
				 if !STARPU_SIMGRID
			
 
				 starpu_mpi_EXAMPLES	+=	\
			
 
				 	mpi_lu/plu_implicit_example_float	\
			
 
				-	mpi_lu/plu_implicit_example_double
			
 
				+	mpi_lu/plu_implicit_example_double	\
			
 
				+	mpi_lu/plu_outofcore_example_float	\
			
 
				+	mpi_lu/plu_outofcore_example_double
			
 
				 endif
			
 
				 
			
 
				 mpi_lu_plu_example_float_LDADD =	\
			
--- a/mpi/examples/benchs/abstract_sendrecv_bench.c
+++ b/mpi/examples/benchs/abstract_sendrecv_bench.c
@@ -22,25 +22,30 @@
 
				 void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
			
 
				 {
			
 
				 	uint64_t iterations = LOOPS_DEFAULT;
			
 
				+	uint64_t s = 0;
			
 
				+	uint64_t j = 0;
			
 
				+	uint64_t k = 0;
			
 
				 
			
 
				 	if (mpi_rank >= 2)
			
 
				 	{
			
 
				+		starpu_pause();
			
 
				 		if (thread_barrier != NULL)
			
 
				 		{
			
 
				 			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
			
 
				 		}
			
 
				 
			
 
				-		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
			
 
				+		for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
			
 
				 		{
			
 
				 			iterations = bench_nb_iterations(iterations, s);
			
 
				 
			
 
				 			starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				 
			
 
				-			for (uint64_t j = 0; j < iterations; j++)
			
 
				+			for (j = 0; j < iterations; j++)
			
 
				 			{
			
 
				 				starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				 			}
			
 
				 		}
			
 
				+		starpu_resume();
			
 
				 
			
 
				 		return;
			
 
				 	}
			
@@ -64,7 +69,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
				 	}
			
 
				 
			
 
				 	global_tstart = starpu_timing_now();
			
 
				-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
			
 
				+	for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
			
 
				 	{
			
 
				 		vector_send = malloc(s);
			
 
				 		vector_recv = malloc(s);
			
@@ -78,7 +83,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
				 
			
 
				 		starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				 
			
 
				-		for (uint64_t j = 0; j < iterations; j++)
			
 
				+		for (j = 0; j < iterations; j++)
			
 
				 		{
			
 
				 			if (mpi_rank == 0)
			
 
				 			{
			
@@ -111,7 +116,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
				 			const double d9_lat = lats[9 * (iterations - 1) / 10];
			
 
				 			double avg_lat = 0.0;
			
 
				 
			
 
				-			for(uint64_t k = 0; k < iterations; k++)
			
 
				+			for(k = 0; k < iterations; k++)
			
 
				 			{
			
 
				 				avg_lat += lats[k];
			
 
				 			}
			
--- a/mpi/examples/benchs/burst_helper.c
+++ b/mpi/examples/benchs/burst_helper.c
@@ -45,7 +45,8 @@ void burst_init_data(int rank)
 
				 		recv_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
			
 
				 		send_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
			
 
				 
			
 
				-		for (int i = 0; i < burst_nb_requests; i++)
			
 
				+		int i = 0;
			
 
				+		for (i = 0; i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			send_buffers[i] = malloc(NX_ARRAY * sizeof(float));
			
 
				 			memset(send_buffers[i], 0, NX_ARRAY * sizeof(float));
			
@@ -62,7 +63,8 @@ void burst_free_data(int rank)
 
				 {
			
 
				 	if (rank == 0 || rank == 1)
			
 
				 	{
			
 
				-		for (int i = 0; i < burst_nb_requests; i++)
			
 
				+		int i = 0;
			
 
				+		for (i = 0; i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			starpu_data_unregister(send_handles[i]);
			
 
				 			free(send_buffers[i]);
			
@@ -84,12 +86,13 @@ void burst_free_data(int rank)
 
				 void burst_bidir(int rank)
			
 
				 {
			
 
				 	int other_rank = (rank == 0) ? 1 : 0;
			
 
				+	int i = 0;
			
 
				 
			
 
				 	FPRINTF(stderr, "Simultaneous....start (rank %d)\n", rank);
			
 
				 
			
 
				 	if (rank == 0 || rank == 1)
			
 
				 	{
			
 
				-		for (int i = 0; i < burst_nb_requests; i++)
			
 
				+		for (i = 0; i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			recv_reqs[i] = NULL;
			
 
				 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
			
@@ -100,13 +103,13 @@ void burst_bidir(int rank)
 
				 
			
 
				 	if (rank == 0 || rank == 1)
			
 
				 	{
			
 
				-		for (int i = 0; i < burst_nb_requests; i++)
			
 
				+		for (i = 0; i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			send_reqs[i] = NULL;
			
 
				 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
			
 
				 		}
			
 
				 
			
 
				-		for (int i = 0; i < burst_nb_requests; i++)
			
 
				+		for (i = 0; i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
			
 
				 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
			
@@ -120,10 +123,11 @@ void burst_bidir(int rank)
 
				 void burst_unidir(int sender, int receiver, int rank)
			
 
				 {
			
 
				 	FPRINTF(stderr, "%d -> %d... start (rank %d)\n", sender, receiver, rank);
			
 
				+	int i = 0;
			
 
				 
			
 
				 	if (rank == receiver)
			
 
				 	{
			
 
				-		for (int i = 0; i < burst_nb_requests; i++)
			
 
				+		for (i = 0; i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			recv_reqs[i] = NULL;
			
 
				 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], sender, i, MPI_COMM_WORLD);
			
@@ -134,7 +138,7 @@ void burst_unidir(int sender, int receiver, int rank)
 
				 
			
 
				 	if (rank == sender)
			
 
				 	{
			
 
				-		for (int i = 0; i < burst_nb_requests; i++)
			
 
				+		for (i = 0; i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			send_reqs[i] = NULL;
			
 
				 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], receiver, i, i, MPI_COMM_WORLD);
			
@@ -143,7 +147,7 @@ void burst_unidir(int sender, int receiver, int rank)
 
				 
			
 
				 	if (rank == sender || rank == receiver)
			
 
				 	{
			
 
				-		for (int i = 0; i < burst_nb_requests; i++)
			
 
				+		for (i = 0; i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			if (rank != sender && recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
			
 
				 			if (rank == sender && send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
			
@@ -160,12 +164,13 @@ void burst_bidir_half_postponed(int rank)
 
				 {
			
 
				 	int other_rank = (rank == 0) ? 1 : 0;
			
 
				 	int received = 0;
			
 
				+	int i = 0;
			
 
				 
			
 
				 	FPRINTF(stderr, "Half/half burst...start (rank %d)\n", rank);
			
 
				 
			
 
				 	if (rank == 0 || rank == 1)
			
 
				 	{
			
 
				-		for (int i = 0; i < burst_nb_requests; i++)
			
 
				+		for (i = 0; i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			recv_reqs[i] = NULL;
			
 
				 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
			
@@ -176,7 +181,7 @@ void burst_bidir_half_postponed(int rank)
 
				 
			
 
				 	if (rank == 0 || rank == 1)
			
 
				 	{
			
 
				-		for (int i = 0; i < (burst_nb_requests / 2); i++)
			
 
				+		for (i = 0; i < (burst_nb_requests / 2); i++)
			
 
				 		{
			
 
				 			send_reqs[i] = NULL;
			
 
				 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
			
@@ -184,13 +189,13 @@ void burst_bidir_half_postponed(int rank)
 
				 
			
 
				 		if (recv_reqs[burst_nb_requests / 4]) starpu_mpi_wait(&recv_reqs[burst_nb_requests / 4], MPI_STATUS_IGNORE);
			
 
				 
			
 
				-		for (int i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
			
 
				+		for (i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			send_reqs[i] = NULL;
			
 
				 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
			
 
				 		}
			
 
				 
			
 
				-		for (int i = 0; i < burst_nb_requests; i++)
			
 
				+		for (i = 0; i < burst_nb_requests; i++)
			
 
				 		{
			
 
				 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
			
 
				 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
			
--- a/mpi/examples/benchs/gemm_helper.c
+++ b/mpi/examples/benchs/gemm_helper.c
@@ -98,8 +98,9 @@ static void cpu_init_matrix_random(void *descr[], void *arg)
 
				 	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+	unsigned i = 0;
			
 
				 
			
 
				-	for (unsigned i = 0; i < nx *ny; i++)
			
 
				+	for (i = 0; i < nx *ny; i++)
			
 
				 	{
			
 
				 		subA[i] = (TYPE) (starpu_drand48());
			
 
				 		subB[i] = (TYPE) (starpu_drand48());
			
@@ -113,8 +114,9 @@ static void cpu_init_matrix_zero(void *descr[], void *arg)
 
				 	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				+	unsigned i = 0;
			
 
				 
			
 
				-	for (unsigned i = 0; i < nx *ny; i++)
			
 
				+	for (i = 0; i < nx *ny; i++)
			
 
				 	{
			
 
				 		subA[i] = (TYPE) (0);
			
 
				 	}
			
@@ -290,18 +292,21 @@ void gemm_add_polling_dependencies()
 
				 {
			
 
				 	starpu_tag_t nb_tasks = (starpu_tag_t) nslices * (starpu_tag_t) nslices;
			
 
				 	unsigned nb_workers = starpu_worker_get_count();
			
 
				+	starpu_tag_t synchro_tag = 0;
			
 
				+	starpu_tag_t previous_tag = 0;
			
 
				+	starpu_tag_t next_tag = 0;
			
 
				 
			
 
				-	for (starpu_tag_t synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
			
 
				+	for (synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
			
 
				 	{
			
 
				 		// this synchro tag depends on tasks of previous column of tasks:
			
 
				-		for (starpu_tag_t previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
			
 
				+		for (previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
			
 
				 		{
			
 
				 			starpu_tag_declare_deps(synchro_tag, 1, previous_tag);
			
 
				 		}
			
 
				 
			
 
				 		// tasks of the next column of tasks depend on this synchro tag:
			
 
				 		// this actually allows workers to poll for new tasks, while no task is available
			
 
				-		for (starpu_tag_t next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
			
 
				+		for (next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
			
 
				 		{
			
 
				 			starpu_tag_declare_deps(next_tag, 1, synchro_tag);
			
 
				 		}
			
--- a/mpi/examples/benchs/sendrecv_bench.c
+++ b/mpi/examples/benchs/sendrecv_bench.c
@@ -27,9 +27,10 @@ int main(int argc, char **argv)
 
				 {
			
 
				 	int ret, rank, worldsize;
			
 
				 	int pause_workers = 0;
			
 
				+	int i = 0;
			
 
				 
			
 
				 
			
 
				-	for (int i = 1; i < argc; i++)
			
 
				+	for (i = 1; i < argc; i++)
			
 
				 	{
			
 
				 		if (strcmp(argv[i], "-p") == 0)
			
 
				 		{
			
--- a/mpi/examples/benchs/sendrecv_parallel_tasks_bench.c
+++ b/mpi/examples/benchs/sendrecv_parallel_tasks_bench.c
@@ -56,6 +56,8 @@ void cpu_task(void* descr[], void* args)
 
				 	double t1, t2;
			
 
				 	int asked_worker;
			
 
				 	int current_worker = starpu_worker_get_id();
			
 
				+	uint64_t j = 0;
			
 
				+	uint64_t k = 0;
			
 
				 
			
 
				 	starpu_codelet_unpack_args(args, &mpi_rank, &asked_worker, &s, &handle_send, &handle_recv);
			
 
				 
			
@@ -64,7 +66,7 @@ void cpu_task(void* descr[], void* args)
 
				 	iterations = bench_nb_iterations(iterations, s);
			
 
				 	double* lats = malloc(sizeof(double) * iterations);
			
 
				 
			
 
				-	for (uint64_t j = 0; j < NB_WARMUP_PINGPONGS; j++)
			
 
				+	for (j = 0; j < NB_WARMUP_PINGPONGS; j++)
			
 
				 	{
			
 
				 		if (mpi_rank == 0)
			
 
				 		{
			
@@ -78,7 +80,7 @@ void cpu_task(void* descr[], void* args)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	for (uint64_t j = 0; j < iterations; j++)
			
 
				+	for (j = 0; j < iterations; j++)
			
 
				 	{
			
 
				 		if (mpi_rank == 0)
			
 
				 		{
			
@@ -107,7 +109,7 @@ void cpu_task(void* descr[], void* args)
 
				 		const double d9_lat = lats[9 * (iterations - 1) / 10];
			
 
				 		double avg_lat = 0.0;
			
 
				 
			
 
				-		for(uint64_t k = 0; k < iterations; k++)
			
 
				+		for(k = 0; k < iterations; k++)
			
 
				 		{
			
 
				 			avg_lat += lats[k];
			
 
				 		}
			
@@ -167,6 +169,8 @@ int main(int argc, char **argv)
 
				 	unsigned cpu_count = starpu_cpu_worker_get_count();
			
 
				 	unsigned* mpi_tags = malloc(cpu_count * sizeof(unsigned));
			
 
				 	unsigned tag = 0;
			
 
				+	uint64_t s = 0;
			
 
				+	unsigned i = 0;
			
 
				 
			
 
				 	int* workers = malloc(cpu_count * sizeof(int));
			
 
				 	float** vectors_send = malloc(cpu_count * sizeof(float*));
			
@@ -174,11 +178,11 @@ int main(int argc, char **argv)
 
				 	starpu_data_handle_t* handles_send = malloc(cpu_count * sizeof(starpu_data_handle_t));
			
 
				 	starpu_data_handle_t* handles_recv = malloc(cpu_count * sizeof(starpu_data_handle_t));
			
 
				 
			
 
				-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
			
 
				+	for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
			
 
				 	{
			
 
				 		starpu_pause();
			
 
				 
			
 
				-		for (unsigned i = 0; i < cpu_count; i++)
			
 
				+		for (i = 0; i < cpu_count; i++)
			
 
				 		{
			
 
				 			workers[i] = i;
			
 
				 			vectors_send[i] = malloc(s);
			
@@ -201,7 +205,7 @@ int main(int argc, char **argv)
 
				 		starpu_resume();
			
 
				 		starpu_task_wait_for_all();
			
 
				 
			
 
				-		for (unsigned i = 0; i < cpu_count; i++)
			
 
				+		for (i = 0; i < cpu_count; i++)
			
 
				 		{
			
 
				 			starpu_data_unregister(handles_send[i]);
			
 
				 			starpu_data_unregister(handles_recv[i]);
			
--- a/mpi/examples/filters/filter.c
+++ b/mpi/examples/filters/filter.c
@@ -59,7 +59,7 @@ void vector_filter(void *father_interface, void *child_interface, struct starpu_
 
				 
			
 
				 	STARPU_ASSERT_MSG(nchunks <= nx, "%u parts for %u elements", nchunks, nx);
			
 
				 	STARPU_ASSERT(nchunks == 2);
			
 
				-	STARPU_ASSERT_MSG((nx % nchunks) == 0, "nx=%d is not a multiple of nchunks %d\n", nx, nchunks);
			
 
				+	STARPU_ASSERT_MSG((nx % nchunks) == 0, "nx=%u is not a multiple of nchunks %u\n", nx, nchunks);
			
 
				 
			
 
				 	vector_child->id = vector_father->id;
			
 
				 	vector_child->nx = nx/2;
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
@@ -20,6 +20,64 @@
 
				 #include <limits.h>
			
 
				 #include <math.h>
			
 
				 
			
 
				+/* This is from magma
			
 
				+
			
 
				+  -- Innovative Computing Laboratory
			
 
				+  -- Electrical Engineering and Computer Science Department
			
 
				+  -- University of Tennessee
			
 
				+  -- (C) Copyright 2009
			
 
				+
			
 
				+  Redistribution  and  use  in  source and binary forms, with or without
			
 
				+  modification,  are  permitted  provided  that the following conditions
			
 
				+  are met:
			
 
				+
			
 
				+  * Redistributions  of  source  code  must  retain  the above copyright
			
 
				+    notice,  this  list  of  conditions  and  the  following  disclaimer.
			
 
				+  * Redistributions  in  binary  form must reproduce the above copyright
			
 
				+    notice,  this list of conditions and the following disclaimer in the
			
 
				+    documentation  and/or other materials provided with the distribution.
			
 
				+  * Neither  the  name of the University of Tennessee, Knoxville nor the
			
 
				+    names of its contributors may be used to endorse or promote products
			
 
				+    derived from this software without specific prior written permission.
			
 
				+
			
 
				+  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
			
 
				+  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
			
 
				+  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
			
 
				+  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
			
 
				+  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
			
 
				+  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
			
 
				+  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
			
 
				+  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
			
 
				+  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
			
 
				+  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
			
 
				+  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
			
 
				+
			
 
				+  */
			
 
				+
			
 
				+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
			
 
				+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
			
 
				+
			
 
				+#define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
			
 
				+
			
 
				+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
			
 
				+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
			
 
				+
			
 
				+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
			
 
				+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
			
 
				+
			
 
				+#define FMULS_TRSM FMULS_TRMM
			
 
				+#define FADDS_TRSM FMULS_TRMM
			
 
				+
			
 
				+#define FLOPS_STRSM(__m, __n) (     FMULS_TRSM((__m), (__n)) +       FADDS_TRSM((__m), (__n)) )
			
 
				+
			
 
				+
			
 
				+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
			
 
				+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
			
 
				+
			
 
				+#define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
			
 
				+
			
 
				+/* End of magma code */
			
 
				+
			
 
				 /*
			
 
				  *	Create the codelets
			
 
				  */
			
@@ -72,6 +130,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 
				 {
			
 
				 	unsigned k, m, n;
			
 
				 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
			
 
				+	unsigned nn = size/nblocks;
			
 
				 
			
 
				 	for (k = 0; k < nblocks; k++)
			
 
				 	{
			
@@ -80,6 +139,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 
				 		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
			
 
				 				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
			
 
				 				       STARPU_RW, data_handles[k][k],
			
 
				+				       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
			
 
				 				       0);
			
 
				 
			
 
				 		for (m = k+1; m<nblocks; m++)
			
@@ -88,28 +148,30 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 
				 					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				 					       STARPU_R, data_handles[k][k],
			
 
				 					       STARPU_RW, data_handles[m][k],
			
 
				+					       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
			
 
				 					       0);
			
 
				 
			
 
				 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
			
 
				 			if (my_distrib(k, k, nodes) == rank)
			
 
				 				starpu_data_wont_use(data_handles[k][k]);
			
 
				+		}
			
 
				 
			
 
				-			for (n = k+1; n<nblocks; n++)
			
 
				+		for (n = k+1; n<nblocks; n++)
			
 
				+		{
			
 
				+			for (m = n; m<nblocks; m++)
			
 
				 			{
			
 
				-				if (n <= m)
			
 
				-				{
			
 
				-					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
			
 
				-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				-							       STARPU_R, data_handles[n][k],
			
 
				-							       STARPU_R, data_handles[m][k],
			
 
				-							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
			
 
				-							       0);
			
 
				-				}
			
 
				+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
			
 
				+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				+						       STARPU_R, data_handles[n][k],
			
 
				+						       STARPU_R, data_handles[m][k],
			
 
				+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
			
 
				+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
			
 
				+						       0);
			
 
				 			}
			
 
				 
			
 
				-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
			
 
				-			if (my_distrib(m, k, nodes) == rank)
			
 
				-				starpu_data_wont_use(data_handles[m][k]);
			
 
				+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
			
 
				+			if (my_distrib(n, k, nodes) == rank)
			
 
				+				starpu_data_wont_use(data_handles[n][k]);
			
 
				 		}
			
 
				 		starpu_iteration_pop();
			
 
				 	}
			
@@ -120,6 +182,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 
				 {
			
 
				 	unsigned k, m, n;
			
 
				 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
			
 
				+	unsigned nn = size/nblocks;
			
 
				 
			
 
				 	/* Column */
			
 
				 	for (n = 0; n<nblocks; n++)
			
@@ -137,7 +200,15 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 
				 						       STARPU_R, data_handles[n][k],
			
 
				 						       STARPU_R, data_handles[m][k],
			
 
				 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
			
 
				+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
			
 
				 						       0);
			
 
				+
			
 
				+				if (m == n)
			
 
				+				{
			
 
				+					/* Nobody else will need it */
			
 
				+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
			
 
				+					starpu_data_wont_use(data_handles[m][k]);
			
 
				+				}
			
 
				 			}
			
 
				 			k = n;
			
 
				 			if (m > n)
			
@@ -147,6 +218,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 
				 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				 						       STARPU_R, data_handles[k][k],
			
 
				 						       STARPU_RW, data_handles[m][k],
			
 
				+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
			
 
				 						       0);
			
 
				 			}
			
 
				 			else
			
@@ -155,26 +227,27 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 
				 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
			
 
				 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
			
 
				 						       STARPU_RW, data_handles[k][k],
			
 
				+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
			
 
				 						       0);
			
 
				 			}
			
 
				+
			
 
				 		}
			
 
				 
			
 
				+		/* We won't need it any more */
			
 
				+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
			
 
				+		starpu_data_wont_use(data_handles[n][n]);
			
 
				+
			
 
				 		starpu_iteration_pop();
			
 
				 	}
			
 
				-
			
 
				-	/* Submit flushes, StarPU will fit them according to the progress */
			
 
				-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
			
 
				-	for (m = 0; m < nblocks; m++)
			
 
				-		for (n = 0; n < nblocks ; n++)
			
 
				-			starpu_data_wont_use(data_handles[m][n]);
			
 
				 }
			
 
				 
			
 
				 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
			
 
				 static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
			
 
				 {
			
 
				-	unsigned a, c;
			
 
				+	unsigned a;
			
 
				 	unsigned k, m, n;
			
 
				 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
			
 
				+	unsigned nn = size/nblocks;
			
 
				 
			
 
				 	/* double-antidiagonal number:
			
 
				 	 * - a=0 contains (0,0) plus (1,0)
			
@@ -205,7 +278,15 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 
				 						       STARPU_R, data_handles[n][k],
			
 
				 						       STARPU_R, data_handles[m][k],
			
 
				 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
			
 
				+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
			
 
				 						       0);
			
 
				+
			
 
				+				if (m == nblocks-1)
			
 
				+				{
			
 
				+					/* Nobody else will need it */
			
 
				+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
			
 
				+					starpu_data_wont_use(data_handles[n][k]);
			
 
				+				}
			
 
				 			}
			
 
				 
			
 
				 			/* k = n */
			
@@ -216,6 +297,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 
				 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				 						       STARPU_R, data_handles[k][k],
			
 
				 						       STARPU_RW, data_handles[m][k],
			
 
				+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
			
 
				 						       0);
			
 
				 			}
			
 
				 			else
			
@@ -224,8 +306,16 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 
				 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
			
 
				 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
			
 
				 						       STARPU_RW, data_handles[k][k],
			
 
				+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
			
 
				 						       0);
			
 
				 			}
			
 
				+
			
 
				+			if (m == nblocks - 1)
			
 
				+			{
			
 
				+				/* We do not need the potrf result any more */
			
 
				+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
			
 
				+				starpu_data_wont_use(data_handles[n][n]);
			
 
				+			}
			
 
				 		}
			
 
				 
			
 
				 		/* column within second antidiagonal for a */
			
@@ -246,7 +336,15 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 
				 						       STARPU_R, data_handles[n][k],
			
 
				 						       STARPU_R, data_handles[m][k],
			
 
				 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
			
 
				+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
			
 
				 						       0);
			
 
				+
			
 
				+				if (m == nblocks-1)
			
 
				+				{
			
 
				+					/* Nobody else will need it */
			
 
				+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
			
 
				+					starpu_data_wont_use(data_handles[n][k]);
			
 
				+				}
			
 
				 			}
			
 
				 			/* non-diagonal block, solve */
			
 
				 			k = n;
			
@@ -254,17 +352,19 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 
				 					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				 					       STARPU_R, data_handles[k][k],
			
 
				 					       STARPU_RW, data_handles[m][k],
			
 
				+					       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
			
 
				 					       0);
			
 
				+
			
 
				+			if (m == nblocks - 1)
			
 
				+			{
			
 
				+				/* We do not need the potrf result any more */
			
 
				+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
			
 
				+				starpu_data_wont_use(data_handles[n][n]);
			
 
				+			}
			
 
				 		}
			
 
				 
			
 
				 		starpu_iteration_pop();
			
 
				 	}
			
 
				-
			
 
				-	/* Submit flushes, StarPU will fit them according to the progress */
			
 
				-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
			
 
				-	for (m = 0; m < nblocks; m++)
			
 
				-		for (n = 0; n < nblocks ; n++)
			
 
				-			starpu_data_wont_use(data_handles[m][n]);
			
 
				 }
			
 
				 
			
 
				 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
			
@@ -273,9 +373,10 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
				 	unsigned a;
			
 
				 	int k, m, n;
			
 
				 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
			
 
				+	unsigned nn = size/nblocks;
			
 
				 
			
 
				 	/*
			
 
				-	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that prio ~ 2*a or 2*a+1
			
 
				+	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that gemm prio ~= 2*nblocks - a
			
 
				 	 * double-antidiagonal number:
			
 
				 	 * - a=0 contains (0,0) plus (1,0)
			
 
				 	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
			
@@ -285,41 +386,47 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
				 	{
			
 
				 		starpu_iteration_push(a);
			
 
				 
			
 
				-		for (k = 0; k < nblocks; k++)
			
 
				+		for (k = 0; k < (int) nblocks; k++)
			
 
				 		{
			
 
				 			n = k;
			
 
				 			/* Should be m = a-k-n; for potrf and trsm to respect
			
 
				 			   priorities, but needs to be this for dependencies */
			
 
				 			m = a-2*k-n;
			
 
				 
			
 
				-			if (m < 0 || m >= nblocks)
			
 
				-				continue;
			
 
				-
			
 
				 			if (m == n)
			
 
				 			{
			
 
				 				/* diagonal block, factorize */
			
 
				 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
			
 
				 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
			
 
				 						       STARPU_RW, data_handles[k][k],
			
 
				+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
			
 
				 						       0);
			
 
				 			}
			
 
				-			else
			
 
				+			else if (m >= n && m < (int) nblocks)
			
 
				 			{
			
 
				 				/* non-diagonal block, solve */
			
 
				 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
			
 
				 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				 						       STARPU_R, data_handles[k][k],
			
 
				 						       STARPU_RW, data_handles[m][k],
			
 
				+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
			
 
				 						       0);
			
 
				 			}
			
 
				 
			
 
				+			if (m == (int) nblocks - 1)
			
 
				+			{
			
 
				+				/* We do not need the potrf result any more */
			
 
				+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
			
 
				+				starpu_data_wont_use(data_handles[n][n]);
			
 
				+			}
			
 
				+
			
 
				 			/* column within antidiagonal for a */
			
 
				-			for (n = k + 1; n < nblocks; n++)
			
 
				+			for (n = k + 1; n < (int) nblocks; n++)
			
 
				 			{
			
 
				 				/* row */
			
 
				 				m = a-2*k-n;
			
 
				 
			
 
				-				if (m >= n && m < nblocks)
			
 
				+				if (m >= n && m < (int) nblocks)
			
 
				 				{
			
 
				 					/* Update */
			
 
				 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
			
@@ -327,7 +434,14 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
				 							       STARPU_R, data_handles[n][k],
			
 
				 							       STARPU_R, data_handles[m][k],
			
 
				 							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
			
 
				+							       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
			
 
				 							       0);
			
 
				+					if (m == (int) nblocks - 1)
			
 
				+					{
			
 
				+						/* Nobody else will need it */
			
 
				+						starpu_data_wont_use(data_handles[n][k]);
			
 
				+						starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
			
 
				+					}
			
 
				 				}
			
 
				 			}
			
 
				 
			
@@ -335,12 +449,6 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
				 
			
 
				 		starpu_iteration_pop();
			
 
				 	}
			
 
				-
			
 
				-	/* Submit flushes, StarPU will fit them according to the progress */
			
 
				-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
			
 
				-	for (m = 0; m < nblocks; m++)
			
 
				-		for (n = 0; n < nblocks ; n++)
			
 
				-			starpu_data_wont_use(data_handles[m][n]);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -423,7 +531,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
				 	if (rank == 0)
			
 
				 	{
			
 
				 		*timing = end - start;
			
 
				-		*flops = (1.0f*size*size*size)/3.0f;
			
 
				+		*flops = FLOPS_SPOTRF(size);
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/mpi/examples/matrix_decomposition/mpi_decomposition_params.c
+++ b/mpi/examples/matrix_decomposition/mpi_decomposition_params.c
@@ -56,63 +56,66 @@ void parse_args(int argc, char **argv, int nodes)
 
				                         size = strtol(argv[++i], &argptr, 10);
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-dblockx") == 0)
			
 
				+                else if (strcmp(argv[i], "-dblockx") == 0)
			
 
				                 {
			
 
				                         char *argptr;
			
 
				                         dblockx = strtol(argv[++i], &argptr, 10);
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-dblocky") == 0)
			
 
				+                else if (strcmp(argv[i], "-dblocky") == 0)
			
 
				                 {
			
 
				                         char *argptr;
			
 
				                         dblocky = strtol(argv[++i], &argptr, 10);
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-nblocks") == 0)
			
 
				+                else if (strcmp(argv[i], "-nblocks") == 0)
			
 
				                 {
			
 
				                         char *argptr;
			
 
				                         nblocks = strtol(argv[++i], &argptr, 10);
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-nbigblocks") == 0)
			
 
				+                else if (strcmp(argv[i], "-nbigblocks") == 0)
			
 
				                 {
			
 
				                         char *argptr;
			
 
				                         nbigblocks = strtol(argv[++i], &argptr, 10);
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-columns") == 0)
			
 
				+                else if (strcmp(argv[i], "-columns") == 0)
			
 
				                 {
			
 
				                         submission = COLUMNS;
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-antidiagonals") == 0)
			
 
				+                else if (strcmp(argv[i], "-antidiagonals") == 0)
			
 
				                 {
			
 
				                         submission = ANTIDIAGONALS;
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-prios") == 0)
			
 
				+                else if (strcmp(argv[i], "-prios") == 0)
			
 
				                 {
			
 
				                         submission = PRIOS;
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-no-prio") == 0)
			
 
				+                else if (strcmp(argv[i], "-no-prio") == 0)
			
 
				                 {
			
 
				                         noprio = 1;
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-check") == 0)
			
 
				+                else if (strcmp(argv[i], "-check") == 0)
			
 
				                 {
			
 
				                         check = 1;
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-display") == 0)
			
 
				+                else if (strcmp(argv[i], "-display") == 0)
			
 
				                 {
			
 
				                         display = 1;
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
			
 
				+                else
			
 
				+                /* if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) */
			
 
				                 {
			
 
				                         printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-prios] [-no-prio] [-display] [-check]\n", argv[0]);
			
 
				+                        fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
			
 
				+                        exit(0);
			
 
				                 }
			
 
				         }
			
 
				 
			
--- a/mpi/examples/mpi_lu/plu_example.c
+++ b/mpi/examples/mpi_lu/plu_example.c
@@ -37,8 +37,8 @@
 
				 static unsigned long size = 4096;
			
 
				 static unsigned nblocks = 16;
			
 
				 static unsigned check = 0;
			
 
				-static int p = 1;
			
 
				-static int q = 1;
			
 
				+static int p = -1;
			
 
				+static int q = -1;
			
 
				 static unsigned display = 0;
			
 
				 static unsigned no_prio = 0;
			
 
				 
			
@@ -434,6 +434,7 @@ int main(int argc, char **argv)
 
				 	int rank;
			
 
				 	int world_size;
			
 
				 	int ret;
			
 
				+	unsigned i, j;
			
 
				 
			
 
				 	/*
			
 
				 	 *	Initialization
			
@@ -462,7 +463,14 @@ int main(int argc, char **argv)
 
				 	/* We disable sequential consistency in this example */
			
 
				 	starpu_data_set_default_sequential_consistency_flag(0);
			
 
				 
			
 
				-	STARPU_ASSERT(p*q == world_size);
			
 
				+	if (p == -1 && q==-1)
			
 
				+	{
			
 
				+		fprintf(stderr, "Setting default values for p and q\n");
			
 
				+		p = (q % 2 == 0) ? 2 : 1;
			
 
				+		q = world_size / p;
			
 
				+
			
 
				+	}
			
 
				+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
			
 
				 
			
 
				 	starpu_cublas_init();
			
 
				 
			
@@ -594,6 +602,18 @@ int main(int argc, char **argv)
 
				 	/*
			
 
				 	 * 	Termination
			
 
				 	 */
			
 
				+	for (j = 0; j < nblocks; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < nblocks; i++)
			
 
				+		{
			
 
				+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
			
 
				+			TYPE *blockptr = dataA[j+i*nblocks];
			
 
				+			if (blockptr != STARPU_POISON_PTR)
			
 
				+				starpu_free(blockptr);
			
 
				+		}
			
 
				+	}
			
 
				+	free(dataA_handles);
			
 
				+	free(dataA);
			
 
				 
			
 
				 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
 
				 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
--- a/mpi/examples/mpi_lu/plu_implicit_example.c
+++ b/mpi/examples/mpi_lu/plu_implicit_example.c
@@ -249,6 +249,7 @@ int main(int argc, char **argv)
 
				 	int rank;
			
 
				 	int world_size;
			
 
				 	int ret;
			
 
				+	unsigned i, j;
			
 
				 
			
 
				 	starpu_srand48((long int)time(NULL));
			
 
				 
			
@@ -376,6 +377,18 @@ int main(int argc, char **argv)
 
				 	/*
			
 
				 	 * 	Termination
			
 
				 	 */
			
 
				+	for (j = 0; j < nblocks; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < nblocks; i++)
			
 
				+		{
			
 
				+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
			
 
				+			TYPE *blockptr = dataA[j+i*nblocks];
			
 
				+			if (blockptr != STARPU_POISON_PTR)
			
 
				+				starpu_free(blockptr);
			
 
				+		}
			
 
				+	}
			
 
				+	free(dataA_handles);
			
 
				+	free(dataA);
			
 
				 
			
 
				 	starpu_cublas_shutdown();
			
 
				 	starpu_mpi_shutdown();
			
--- a/mpi/examples/mpi_lu/plu_outofcore_example.c
+++ b/mpi/examples/mpi_lu/plu_outofcore_example.c
@@ -39,9 +39,10 @@
 
				 
			
 
				 static unsigned long size = 4096;
			
 
				 static unsigned nblocks = 16;
			
 
				+static size_t blocksize;
			
 
				 static unsigned check = 0;
			
 
				-static int p = 1;
			
 
				-static int q = 1;
			
 
				+static int p = -1;
			
 
				+static int q = -1;
			
 
				 static unsigned display = 0;
			
 
				 static unsigned no_prio = 0;
			
 
				 static char *path = "./starpu-ooc-files";
			
@@ -53,6 +54,9 @@ static unsigned numa = 0;
 
				 static size_t allocated_memory = 0;
			
 
				 
			
 
				 static starpu_data_handle_t *dataA_handles;
			
 
				+static void **disk_objs;
			
 
				+
			
 
				+static int disk_node;
			
 
				 
			
 
				 int get_block_rank(unsigned i, unsigned j);
			
 
				 
			
@@ -142,7 +146,6 @@ static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnbl
 
				 
			
 
				 static void create_matrix()
			
 
				 {
			
 
				-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
			
 
				 	TYPE *blockptr = malloc(blocksize);
			
 
				 	int fd;
			
 
				 	char *filename;
			
@@ -195,10 +198,9 @@ static void init_matrix(int rank)
 
				 {
			
 
				 	/* Allocate a grid of data handles, not all of them have to be allocated later on */
			
 
				 	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
			
 
				+	disk_objs = calloc(nblocks*nblocks, sizeof(*disk_objs));
			
 
				 
			
 
				-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
			
 
				-
			
 
				-	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
			
 
				+	disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(16*1024*1024, size*size*sizeof(TYPE)));
			
 
				 	assert(disk_node >= 0);
			
 
				 
			
 
				 	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
			
@@ -215,21 +217,21 @@ static void init_matrix(int rank)
 
				 
			
 
				 			if (block_rank == rank)
			
 
				 			{
			
 
				-				void *disk_obj;
			
 
				 				snprintf(filename, sizeof(filename), "%u,%u", i, j);
			
 
				 				/* Register it to StarPU */
			
 
				-				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
			
 
				-				if (!disk_obj)
			
 
				+				disk_objs[j+nblocks*i] = starpu_disk_open(disk_node, filename, blocksize);
			
 
				+				if (!disk_objs[j+nblocks*i])
			
 
				 				{
			
 
				 					fprintf(stderr,"could not open %s\n", filename);
			
 
				 					exit(1);
			
 
				 				}
			
 
				 				starpu_matrix_data_register(handleptr, disk_node,
			
 
				-					(uintptr_t) disk_obj, size/nblocks,
			
 
				+					(uintptr_t) disk_objs[j+nblocks*i], size/nblocks,
			
 
				 					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				 			}
			
 
				 			else
			
 
				 			{
			
 
				+				disk_objs[j+nblocks*i] = NULL;
			
 
				 				starpu_matrix_data_register(handleptr, -1,
			
 
				 					0, size/nblocks,
			
 
				 					size/nblocks, size/nblocks, sizeof(TYPE));
			
@@ -273,6 +275,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				+	blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
			
 
				+
			
 
				 	ret = mkdir(path, 0777);
			
 
				 	if (ret != 0 && errno != EEXIST)
			
 
				 	{
			
@@ -286,7 +290,14 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
			
 
				 
			
 
				-	STARPU_ASSERT(p*q == world_size);
			
 
				+	if (p == -1 && q==-1)
			
 
				+	{
			
 
				+		fprintf(stderr, "Setting default values for p and q\n");
			
 
				+		p = (q % 2 == 0) ? 2 : 1;
			
 
				+		q = world_size / p;
			
 
				+
			
 
				+	}
			
 
				+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
			
 
				 
			
 
				 	starpu_cublas_init();
			
 
				 
			
@@ -401,8 +412,12 @@ int main(int argc, char **argv)
 
				 		for (i = 0; i < nblocks; i++)
			
 
				 		{
			
 
				 			starpu_data_unregister(dataA_handles[j+nblocks*i]);
			
 
				+			if (disk_objs[j+nblocks*i])
			
 
				+				starpu_disk_close(disk_node, disk_objs[j+nblocks*i], blocksize);
			
 
				 		}
			
 
				 	}
			
 
				+	free(dataA_handles);
			
 
				+	free(disk_objs);
			
 
				 
			
 
				 	starpu_cublas_shutdown();
			
 
				 	starpu_mpi_shutdown();
			
--- a/mpi/src/mpi/starpu_mpi_mpi.c
+++ b/mpi/src/mpi/starpu_mpi_mpi.c
@@ -203,7 +203,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
				 			else
			
 
				 			{
			
 
				 				STARPU_ASSERT(req->count);
			
 
				-				_STARPU_MPI_MALLOC(req->ptr, req->count);
			
 
				+				req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
			
 
				 			}
			
 
				 
			
 
				 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
			
@@ -225,12 +225,12 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
				 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
			
 
				 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
			
 
				 
			
 
				-			/* Case: a receive request for a data with the given tag and source has already been
			
 
				-			 * posted by StarPU. Asynchronously requests a Read permission over the temporary handle ,
			
 
				-			 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
			
 
				-			 * will be called to bring the data back to the original data handle associated to the request.*/
			
 
				 			if (early_data_handle)
			
 
				 			{
			
 
				+				/* Case: a receive request for a data with the given tag and source has already been
			
 
				+				 * posted to MPI by StarPU. Asynchronously requests a Read permission over the temporary handle ,
			
 
				+				 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
			
 
				+				 * will be called to bring the data back to the original data handle associated to the request.*/
			
 
				 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
			
 
				 				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
			
 
				 				while (!(early_data_handle->req_ready))
			
@@ -254,16 +254,16 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
				 
			
 
				 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
			
 
				 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
			
 
				-				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
			
 
				+				starpu_data_acquire_on_node_cb(early_data_handle->handle,STARPU_MAIN_RAM,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
			
 
				 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
			
 
				 			}
			
 
				-			/* Case: no matching data has been received. Store the receive request as an early_request. */
			
 
				 			else
			
 
				 			{
			
 
				 				struct _starpu_mpi_req *sync_req = _starpu_mpi_sync_data_find(req->node_tag.data_tag, req->node_tag.node.rank, req->node_tag.node.comm);
			
 
				 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
			
 
				 				if (sync_req)
			
 
				 				{
			
 
				+					/* Case: we already received the send envelope, we can proceed with the receive */
			
 
				 					req->sync = 1;
			
 
				 					_starpu_mpi_datatype_allocate(req->data_handle, req);
			
 
				 					if (req->registered_datatype == 1)
			
@@ -275,14 +275,16 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
				 					{
			
 
				 						req->count = sync_req->count;
			
 
				 						STARPU_ASSERT(req->count);
			
 
				-						_STARPU_MPI_MALLOC(req->ptr, req->count);
			
 
				+						req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
			
 
				 					}
			
 
				 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
			
 
				 					_STARPU_MPI_INC_READY_REQUESTS(+1);
			
 
				+					/* Throw away the dumb request that was only used to know that we got the envelope */
			
 
				 					_starpu_mpi_request_destroy(sync_req);
			
 
				 				}
			
 
				 				else
			
 
				 				{
			
 
				+					/* Case: no matching data has been received. Store the receive request as an early_request. */
			
 
				 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
			
 
				 					_starpu_mpi_early_request_enqueue(req);
			
 
				 				}
			
@@ -684,6 +686,8 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
				 
			
 
				 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
			
 
				 
			
 
				+	STARPU_VALGRIND_YIELD();
			
 
				+
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 	ret = req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, flag);
			
 
				 	if (*flag)
			
@@ -908,6 +912,8 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 
				 	_STARPU_MPI_LOG_OUT();
			
 
				 }
			
 
				 
			
 
				+/* This is called when the data is now received in the early data handle, we can
			
 
				+ * now copy it over to the real handle. */
			
 
				 static void _starpu_mpi_early_data_cb(void* arg)
			
 
				 {
			
 
				 	struct _starpu_mpi_early_data_cb_args *args = arg;
			
@@ -954,7 +960,7 @@ static void _starpu_mpi_early_data_cb(void* arg)
 
				 	}
			
 
				 
			
 
				 	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
			
 
				-	starpu_data_release(args->early_handle);
			
 
				+	starpu_data_release_on_node(args->early_handle, STARPU_MAIN_RAM);
			
 
				 
			
 
				 	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
			
 
				 	/* XXX: note that we have already freed the registered buffer above. In
			
@@ -1101,8 +1107,6 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 
				 		_starpu_mpi_req_list_push_back(&detached_requests, req);
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
			
 
				 
			
 
				-		starpu_wake_all_blocked_workers();
			
 
				-
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
			
 
				 		STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
			
@@ -1204,14 +1208,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
			
 
				 	}
			
 
				 
			
 
				-	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
			
 
				+	if (!_starpu_mpi_nobind && starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
			
 
				 	{
			
 
				 		char hostname[65];
			
 
				 		gethostname(hostname, sizeof(hostname));
			
 
				 		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
			
 
				 	}
			
 
				 	_starpu_mpi_do_initialize(argc_argv);
			
 
				-	if (_starpu_mpi_thread_cpuid >= 0)
			
 
				+	if (!_starpu_mpi_nobind && _starpu_mpi_thread_cpuid >= 0)
			
 
				 		/* In case MPI changed the binding */
			
 
				 		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
			
 
				 #else
			
@@ -1450,7 +1454,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 						else
			
 
				 						{
			
 
				 							early_request->count = envelope->size;
			
 
				-							_STARPU_MPI_MALLOC(early_request->ptr, early_request->count);
			
 
				+							early_request->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, early_request->count, 0);
			
 
				 							starpu_memory_allocate(STARPU_MAIN_RAM, early_request->count, STARPU_MEMORY_OVERFLOW);
			
 
				 
			
 
				 							STARPU_MPI_ASSERT_MSG(early_request->ptr, "cannot allocate message of size %ld\n", early_request->count);
			
--- a/mpi/src/nmad/starpu_mpi_nmad.c
+++ b/mpi/src/nmad/starpu_mpi_nmad.c
@@ -245,6 +245,8 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
				 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
			
 
				 			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
			
 
				 
			
 
				+	STARPU_VALGRIND_YIELD();
			
 
				+
			
 
				 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
			
 
				 
			
 
				 	/* we must do a test_locked to avoid race condition :
			
@@ -344,7 +346,7 @@ void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_ev
 
				 				// req->ptr is freed by starpu_data_unpack
			
 
				 				starpu_data_unpack(req->data_handle, req->ptr, req->count);
			
 
				 			else
			
 
				-				free(req->ptr);
			
 
				+				starpu_free_on_node_flags(STARPU_MAIN_RAM, (uintptr_t) req->ptr, req->count, 0);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
@@ -451,7 +453,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
			
 
				+	if (!_starpu_mpi_nobind && starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
			
 
				 	{
			
 
				 		char hostname[65];
			
 
				 		gethostname(hostname, sizeof(hostname));
			
@@ -623,7 +625,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
				 	 * required for piom_ltask_set_bound_thread_indexes() */
			
 
				 	_starpu_mpi_do_initialize(argc_argv);
			
 
				 
			
 
				-	if (_starpu_mpi_thread_cpuid < 0)
			
 
				+	if (!_starpu_mpi_nobind && _starpu_mpi_thread_cpuid < 0)
			
 
				 	{
			
 
				 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
			
 
				 	}
			
@@ -633,7 +635,8 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
				 	/* Tell pioman to use a bound thread for communication progression:
			
 
				 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
			
 
				 	int indexes[1] = { _starpu_mpi_thread_cpuid };
			
 
				-	piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
			
 
				+	if (!_starpu_mpi_nobind)
			
 
				+		piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
			
 
				 
			
 
				 	/* Register some hooks for communication progress if needed */
			
 
				 	int polling_point_prog, polling_point_idle;
			
--- a/mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c
+++ b/mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c
@@ -130,7 +130,7 @@ static void _starpu_mpi_unknown_datatype_recv_callback(nm_sr_event_t event, cons
 
				 		int ret = nm_sr_recv_peek(req->backend->session, &(req->backend->data_request), &(req->backend->unknown_datatype_size));
			
 
				 		STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "nm_sr_recv_peek returned %d", ret);
			
 
				 
			
 
				-		req->ptr = malloc(req->count);
			
 
				+		req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
			
 
				 		STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld", req->count);
			
 
				 
			
 
				 		nm_mpi_nmad_data_get(&(req->backend->unknown_datatype_body), (void*) req->ptr, req->datatype, req->count);
			
--- a/mpi/src/starpu_mpi_coop_sends.c
+++ b/mpi/src/starpu_mpi_coop_sends.c
@@ -47,13 +47,13 @@ void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req)
 
				 			/* We were last, release data */
			
 
				 			free(coop_sends->reqs_array);
			
 
				 			free(coop_sends);
			
 
				-			starpu_data_release(req->data_handle);
			
 
				+			starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
			
 
				 		}
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				 		/* Trivial request */
			
 
				-		starpu_data_release(req->data_handle);
			
 
				+		starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/mpi/src/starpu_mpi_fxt.h
+++ b/mpi/src/starpu_mpi_fxt.h
@@ -72,7 +72,7 @@ extern "C"
 
				 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, data_tag, size)	\
			
 
				 	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
			
 
				 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, data_tag, size, jobid, handle)	\
			
 
				-	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), _starpu_gettid(), (handle));
			
 
				+	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), (handle), _starpu_gettid());
			
 
				 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, data_tag)	\
			
 
				 	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (data_tag), _starpu_gettid());
			
 
				 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, data_tag)	\
			
--- a/mpi/src/starpu_mpi_private.c
+++ b/mpi/src/starpu_mpi_private.c
@@ -22,6 +22,7 @@ int _starpu_debug_level_max=0;
 
				 int _starpu_mpi_tag = 42;
			
 
				 int _starpu_mpi_comm_debug;
			
 
				 
			
 
				+int _starpu_mpi_nobind = -1;
			
 
				 int _starpu_mpi_thread_cpuid = -1;
			
 
				 int _starpu_mpi_use_prio = 1;
			
 
				 int _starpu_mpi_fake_world_size = -1;
			
@@ -62,6 +63,7 @@ void _starpu_mpi_env_init(void)
 
				         _starpu_mpi_comm_debug = starpu_getenv("STARPU_MPI_COMM") != NULL;
			
 
				 	_starpu_mpi_fake_world_size = starpu_get_env_number("STARPU_MPI_FAKE_SIZE");
			
 
				 	_starpu_mpi_fake_world_rank = starpu_get_env_number("STARPU_MPI_FAKE_RANK");
			
 
				+	_starpu_mpi_nobind = starpu_get_env_number_default("STARPU_MPI_NOBIND", 0);
			
 
				 	_starpu_mpi_thread_cpuid = starpu_get_env_number_default("STARPU_MPI_THREAD_CPUID", -1);
			
 
				 	_starpu_mpi_use_prio = starpu_get_env_number_default("STARPU_MPI_PRIORITIES", 1);
			
 
				 	_starpu_mpi_use_coop_sends = starpu_get_env_number_default("STARPU_MPI_COOP_SENDS", 1);
			
--- a/mpi/src/starpu_mpi_private.h
+++ b/mpi/src/starpu_mpi_private.h
@@ -61,6 +61,7 @@ void _starpu_mpi_set_debug_level_max(int level);
 
				 extern int _starpu_mpi_fake_world_size;
			
 
				 extern int _starpu_mpi_fake_world_rank;
			
 
				 extern int _starpu_mpi_use_prio;
			
 
				+extern int _starpu_mpi_nobind;
			
 
				 extern int _starpu_mpi_thread_cpuid;
			
 
				 extern int _starpu_mpi_use_coop_sends;
			
 
				 extern int _starpu_mpi_mem_throttle;
			
@@ -200,7 +201,7 @@ struct _starpu_mpi_data
 
				 {
			
 
				 	int magic;
			
 
				 	struct _starpu_mpi_node_tag node_tag;
			
 
				-	int *cache_sent;
			
 
				+	char *cache_sent;
			
 
				 	int cache_received;
			
 
				 
			
 
				 	/** Rendez-vous data for opportunistic cooperative sends */
			
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -45,10 +45,10 @@ endif
 
				 endif
			
 
				 
			
 
				 if STARPU_HAVE_AM111
			
 
				-TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
 
				+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
 
				 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
			
 
				 else
			
 
				-TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
			
 
				+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
			
 
				 endif
			
 
				 
			
 
				 if STARPU_MPI_CHECK
			
--- a/mpi/tests/broadcast.c
+++ b/mpi/tests/broadcast.c
@@ -40,7 +40,7 @@ int main(int argc, char **argv)
 
				 {
			
 
				 	int ret, rank, size;
			
 
				 	starpu_data_handle_t handle;
			
 
				-	int var;
			
 
				+	int var=-1;
			
 
				 	int mpi_init;
			
 
				 	MPI_Status status;
			
 
				 
			
--- a/mpi/tests/early_request.c
+++ b/mpi/tests/early_request.c
@@ -109,6 +109,18 @@ void submitted_order_fun(void *buffers[], void *cl_arg)
 
				 	(void)cl_arg;
			
 
				 }
			
 
				 
			
 
				+static struct starpu_codelet submitted_order_rw =
			
 
				+{
			
 
				+	.where = STARPU_CPU,
			
 
				+	.cpu_funcs = {submitted_order_fun, NULL},
			
 
				+	.nbuffers = 2,
			
 
				+	.modes = {STARPU_RW, STARPU_RW},
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	.model = &starpu_perfmodel_nop,
			
 
				+#endif
			
 
				+	.name = "submitted_order_enforcer"
			
 
				+};
			
 
				+
			
 
				 static struct starpu_codelet submitted_order =
			
 
				 {
			
 
				 	.where = STARPU_CPU,
			
@@ -156,11 +168,15 @@ void insert_work_for_one_element(struct element *el)
 
				 			   STARPU_W,tmp_send,
			
 
				 			   0);
			
 
				 	//Send operation
			
 
				-	starpu_insert_task(&submitted_order,
			
 
				+	starpu_insert_task(&submitted_order_rw,
			
 
				 			   STARPU_RW,el->ensure_submitted_order_send,
			
 
				-			   STARPU_W,tmp_send,
			
 
				+			   STARPU_RW,tmp_send,
			
 
				 			   0);
			
 
				 	starpu_mpi_isend_detached(tmp_send,el->foreign_domain,el->tag, MPI_COMM_WORLD, NULL, NULL);
			
 
				+	starpu_insert_task(&submitted_order_rw,
			
 
				+			   STARPU_RW,el->ensure_submitted_order_send,
			
 
				+			   STARPU_RW,tmp_send,
			
 
				+			   0);
			
 
				 
			
 
				 	//Recv operation for current element
			
 
				 	starpu_insert_task(&submitted_order,
			
--- a/mpi/tests/insert_task_compute.c
+++ b/mpi/tests/insert_task_compute.c
@@ -47,7 +47,7 @@ int test(int rank, int node, int *before, int *after, int task_insert, int data_
 
				 	ret = starpu_mpi_init_conf(NULL, NULL, 0, MPI_COMM_WORLD, NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
 
				 
			
 
				-	if (starpu_cpu_worker_get_count() <= 0)
			
 
				+	if (starpu_cpu_worker_get_count() == 0)
			
 
				 	{
			
 
				 		// If there is no cpu to execute the codelet, mpi will block trying to do the post-execution communication
			
 
				 		ret = -ENODEV;
			
--- a/mpi/tests/pingpong.c
+++ b/mpi/tests/pingpong.c
@@ -43,13 +43,14 @@ int main(int argc, char **argv)
 
				 {
			
 
				 	int ret, rank, size;
			
 
				 	int mpi_init;
			
 
				+	int i;
			
 
				 
			
 
				 	int niter = DEFAULT_NITER;
			
 
				 	int data_size = DEFAULT_DATA_SIZE;
			
 
				 	int sleep_time = DEFAULT_SLEEP_TIME;
			
 
				 	int method = DEFAULT_METHOD;
			
 
				 
			
 
				-	for (int i = 1; i < argc; i++)
			
 
				+	for (i = 1; i < argc; i++)
			
 
				 	{
			
 
				 		if (strcmp(argv[i], "-n") == 0)
			
 
				 		{
			
@@ -134,6 +135,7 @@ int main(int argc, char **argv)
 
				 	int loop;
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				 	int sender;
			
 
				+	int r = 0;
			
 
				 
			
 
				 	if (method == 0) // ping pongs
			
 
				 	{
			
@@ -161,7 +163,7 @@ int main(int argc, char **argv)
 
				 			sender = loop % size;
			
 
				 			if (sender == rank)
			
 
				 			{
			
 
				-				for (int r = 0; r < size; r++)
			
 
				+				for (r = 0; r < size; r++)
			
 
				 				{
			
 
				 					if (r != rank)
			
 
				 					{
			
@@ -175,7 +177,7 @@ int main(int argc, char **argv)
 
				 				MPI_Status status;
			
 
				 				starpu_mpi_recv(tab_handle, sender, (rank * niter) + loop, MPI_COMM_WORLD, &status);
			
 
				 
			
 
				-				for (int r = 0; r < (size-1); r++)
			
 
				+				for (r = 0; r < (size-1); r++)
			
 
				 					starpu_sleep(sleep_time / 1000);
			
 
				 			}
			
 
				 		}
			
--- a/src/common/fxt.c
+++ b/src/common/fxt.c
@@ -27,6 +27,7 @@ unsigned long _starpu_job_cnt = 0;
 
				 #ifdef STARPU_USE_FXT
			
 
				 #include <common/fxt.h>
			
 
				 #include <starpu_fxt.h>
			
 
				+#include <sys/stat.h>
			
 
				 
			
 
				 #ifdef STARPU_HAVE_WINDOWS
			
 
				 #include <windows.h>
			
@@ -95,6 +96,16 @@ static void _starpu_profile_set_tracefile(void)
 
				 	char *fxt_prefix = starpu_getenv("STARPU_FXT_PREFIX");
			
 
				 	if (!fxt_prefix)
			
 
				 	     fxt_prefix = "/tmp/";
			
 
				+	else
			
 
				+	{
			
 
				+		// Check if the given folder really exists:
			
 
				+		struct stat folder_stat;
			
 
				+		if (stat(fxt_prefix, &folder_stat) < 0 || !S_ISDIR(folder_stat.st_mode))
			
 
				+		{
			
 
				+			_STARPU_MSG("%s is not a valid directory.\n", fxt_prefix);
			
 
				+			_starpu_abort();
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 	user = starpu_getenv("USER");
			
 
				 	if (!user)
			
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -85,8 +85,10 @@
 
				 #define _STARPU_UYIELD() ((void)0)
			
 
				 #endif
			
 
				 #if defined(STARPU_HAVE_SCHED_YIELD) && defined(STARPU_HAVE_HELGRIND_H)
			
 
				+#define STARPU_VALGRIND_YIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); } while (0)
			
 
				 #define STARPU_UYIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); else _STARPU_UYIELD(); } while (0)
			
 
				 #else
			
 
				+#define STARPU_VALGRIND_YIELD() do { } while (0)
			
 
				 #define STARPU_UYIELD() _STARPU_UYIELD()
			
 
				 #endif
			
 
				 
			
--- a/src/core/dependencies/cg.c
+++ b/src/core/dependencies/cg.c
@@ -169,7 +169,6 @@ int _starpu_list_tag_successors_in_cg_list(struct _starpu_cg_list *successors, u
 
				 	return n;
			
 
				 }
			
 
				 
			
 
				-/* Note: in case of a tag, it must be already locked */
			
 
				 void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg)
			
 
				 {
			
 
				 	STARPU_ASSERT(cg);
			
@@ -208,6 +207,7 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 
				 				struct _starpu_tag *tag;
			
 
				 
			
 
				 				tag = cg->succ.tag;
			
 
				+				_starpu_spin_lock(&tag->lock);
			
 
				 				tag_successors = &tag->tag_successors;
			
 
				 
			
 
				 				tag_successors->ndeps_completed++;
			
@@ -219,8 +219,10 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 
				 				{
			
 
				 					/* reset the counter so that we can reuse the completion group */
			
 
				 					tag_successors->ndeps_completed = 0;
			
 
				+					/* This releases the lock */
			
 
				 					_starpu_tag_set_ready(tag);
			
 
				-				}
			
 
				+				} else
			
 
				+					_starpu_spin_unlock(&tag->lock);
			
 
				 				break;
			
 
				 			}
			
 
				 
			
@@ -370,21 +372,7 @@ void _starpu_notify_cg_list(void *pred, struct _starpu_cg_list *successors)
 
				 			successors->nsuccs--;
			
 
				 		}
			
 
				 		_starpu_spin_unlock(&successors->lock);
			
 
				-
			
 
				-		struct _starpu_tag *cgtag = NULL;
			
 
				-
			
 
				-		if (cg_type == STARPU_CG_TAG)
			
 
				-		{
			
 
				-			cgtag = cg->succ.tag;
			
 
				-			STARPU_ASSERT(cgtag);
			
 
				-			_starpu_spin_lock(&cgtag->lock);
			
 
				-		}
			
 
				-
			
 
				 		_starpu_notify_cg(pred, cg);
			
 
				-
			
 
				-		if (cg_type == STARPU_CG_TAG)
			
 
				-			_starpu_spin_unlock(&cgtag->lock);
			
 
				-
			
 
				 		_starpu_spin_lock(&successors->lock);
			
 
				 	}
			
 
				 	successors->terminated = 1;
			
--- a/src/core/dependencies/data_arbiter_concurrency.c
+++ b/src/core/dependencies/data_arbiter_concurrency.c
@@ -533,7 +533,7 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 
				 }
			
 
				 void ___starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
			
 
				 #else // LOCK_OR_DELEGATE
			
 
				-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
			
 
				+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
			
 
				 #endif
			
 
				 {
			
 
				 	starpu_arbiter_t arbiter = handle->arbiter;
			
@@ -546,10 +546,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 
				 	{
			
 
				 		/* No waiter, just remove our reference */
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
 
				-		STARPU_ASSERT(handle->refcnt > 0);
			
 
				-		handle->refcnt--;
			
 
				-		STARPU_ASSERT(handle->busy_count > 0);
			
 
				-		handle->busy_count--;
			
 
				+		if (down_to_mode == STARPU_NONE)
			
 
				+		{
			
 
				+			STARPU_ASSERT(handle->refcnt > 0);
			
 
				+			handle->refcnt--;
			
 
				+			STARPU_ASSERT(handle->busy_count > 0);
			
 
				+			handle->busy_count--;
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			/* Downgrade from W or RW down to R, keeping the same reference,
			
 
				+			 * but thus allowing other readers without allowing writers.  */
			
 
				+			STARPU_ASSERT(down_to_mode == STARPU_R &&
			
 
				+				      handle->current_mode == STARPU_W);
			
 
				+			handle->current_mode = down_to_mode;
			
 
				+		}
			
 
				 #ifndef LOCK_OR_DELEGATE
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK(&arbiter->mutex);
			
 
				 #endif
			
@@ -562,10 +573,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 
				 
			
 
				 	/* There is a waiter, remove our reference */
			
 
				 	_starpu_spin_lock(&handle->header_lock);
			
 
				-	STARPU_ASSERT(handle->refcnt > 0);
			
 
				-	handle->refcnt--;
			
 
				-	STARPU_ASSERT(handle->busy_count > 0);
			
 
				-	handle->busy_count--;
			
 
				+	if (down_to_mode == STARPU_NONE)
			
 
				+	{
			
 
				+		STARPU_ASSERT(handle->refcnt > 0);
			
 
				+		handle->refcnt--;
			
 
				+		STARPU_ASSERT(handle->busy_count > 0);
			
 
				+		handle->busy_count--;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		/* Downgrade from W or RW down to R, keeping the same reference,
			
 
				+		 * but thus allowing other readers without allowing writers.  */
			
 
				+		STARPU_ASSERT(down_to_mode == STARPU_R &&
			
 
				+			      handle->current_mode == STARPU_W);
			
 
				+		handle->current_mode = down_to_mode;
			
 
				+	}
			
 
				 	/* There should be at least one busy_count reference for the waiter
			
 
				 	 * (thus we don't risk to see the handle disappear below) */
			
 
				 	STARPU_ASSERT(handle->busy_count > 0);
			
--- a/src/core/dependencies/data_concurrency.c
+++ b/src/core/dependencies/data_concurrency.c
@@ -509,10 +509,16 @@ void _starpu_submit_job_take_data_deps(struct _starpu_job *j)
 
				  * This may free the handle if it was lazily unregistered (1 is returned in
			
 
				  * that case). The handle pointer thus becomes invalid for the caller.
			
 
				  */
			
 
				-int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
			
 
				+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
			
 
				 {
			
 
				 	_starpu_spin_checklocked(&handle->header_lock);
			
 
				 
			
 
				+	if (down_to_mode != STARPU_NONE && handle->current_mode == down_to_mode)
			
 
				+	{
			
 
				+		/* No change, nothing to do */
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				 	if (handle->arbiter)
			
 
				 	{
			
 
				 		/* Keep our reference for now, _starpu_notify_arbitered_dependencies
			
@@ -521,22 +527,34 @@ int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 
				 		STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->reduction_req_list));
			
 
				 		_starpu_spin_unlock(&handle->header_lock);
			
 
				 		/* _starpu_notify_arbitered_dependencies will handle its own locking */
			
 
				-		_starpu_notify_arbitered_dependencies(handle);
			
 
				+		_starpu_notify_arbitered_dependencies(handle, down_to_mode);
			
 
				 		/* We have already unlocked */
			
 
				 		return 1;
			
 
				 	}
			
 
				 
			
 
				-	/* A data access has finished so we remove a reference. */
			
 
				-	STARPU_ASSERT(handle->refcnt > 0);
			
 
				-	handle->refcnt--;
			
 
				-	STARPU_ASSERT(handle->busy_count > 0);
			
 
				-	handle->busy_count--;
			
 
				-	if (_starpu_data_check_not_busy(handle))
			
 
				-		/* Handle was destroyed, nothing left to do.  */
			
 
				-		return 1;
			
 
				-
			
 
				 	STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->arbitered_req_list));
			
 
				 
			
 
				+	if (down_to_mode == STARPU_NONE)
			
 
				+	{
			
 
				+		/* A data access has finished so we remove a reference. */
			
 
				+		STARPU_ASSERT(handle->refcnt > 0);
			
 
				+		handle->refcnt--;
			
 
				+		STARPU_ASSERT(handle->busy_count > 0);
			
 
				+		handle->busy_count--;
			
 
				+		if (_starpu_data_check_not_busy(handle))
			
 
				+			/* Handle was destroyed, nothing left to do.  */
			
 
				+			return 1;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		/* Downgrade from W or RW down to R, keeping the same reference,
			
 
				+		 * but thus allowing other readers without allowing writers.  */
			
 
				+		STARPU_ASSERT(down_to_mode == STARPU_R &&
			
 
				+				(handle->current_mode == STARPU_RW ||
			
 
				+				 handle->current_mode == STARPU_W));
			
 
				+		handle->current_mode = down_to_mode;
			
 
				+	}
			
 
				+
			
 
				 	/* In case there is a pending reduction, and that this is the last
			
 
				 	 * requester, we may go back to a "normal" coherency model. */
			
 
				 	if (handle->reduction_refcnt > 0)
			
--- a/src/core/dependencies/data_concurrency.h
+++ b/src/core/dependencies/data_concurrency.h
@@ -28,8 +28,8 @@ void _starpu_submit_job_enforce_arbitered_deps(struct _starpu_job *j, unsigned b
 
				 void _starpu_submit_job_take_data_deps(struct _starpu_job *j);
			
 
				 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
			
 
				 
			
 
				-int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
			
 
				-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle);
			
 
				+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
			
 
				+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
			
 
				 
			
 
				 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
			
 
				 							  enum starpu_data_access_mode mode,
			
--- a/src/core/dependencies/implicit_data_deps.c
+++ b/src/core/dependencies/implicit_data_deps.c
@@ -225,10 +225,13 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
				 		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
			
 
				 		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
			
 
				 
			
 
				+		if (mode & STARPU_R)
			
 
				+			STARPU_ASSERT_MSG(handle->initialized || handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
			
 
				+
			
 
				 		if (mode & STARPU_W || mode == STARPU_REDUX)
			
 
				 		{
			
 
				 
			
 
				-			STARPU_ASSERT_MSG(!handle->readonly, "Read-only handles can not be written to");
			
 
				+			STARPU_ASSERT_MSG(!handle->readonly, "Read-only handle %p can not be written to", handle);
			
 
				 
			
 
				 			handle->initialized = 1;
			
 
				 			/* We will change our value, disconnect from our readonly duplicates */
			
@@ -613,18 +616,25 @@ void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data
 
				         _STARPU_LOG_OUT();
			
 
				 }
			
 
				 
			
 
				-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
			
 
				+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
			
 
				 {
			
 
				 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
			
 
				 	unsigned do_submit_tasks = 0;
			
 
				+	unsigned last_cnt;
			
 
				 
			
 
				 	/* Here helgrind would shout that this is an unprotected access, but
			
 
				 	 * count can only be zero if we don't have to care about
			
 
				 	 * post_sync_tasks_cnt at all.  */
			
 
				-	if (STARPU_RUNNING_ON_VALGRIND || handle->post_sync_tasks_cnt)
			
 
				+	if (handle->post_sync_tasks_cnt)
			
 
				 	{
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
			
 
				-		if (--handle->post_sync_tasks_cnt == 0)
			
 
				+		last_cnt = handle->post_sync_tasks_cnt;
			
 
				+
			
 
				+		if (mode == STARPU_NONE)
			
 
				+			/* Last release from us */
			
 
				+			handle->post_sync_tasks_cnt--;
			
 
				+
			
 
				+		if (last_cnt == 1)
			
 
				 		{
			
 
				 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
			
 
				 			do_submit_tasks = 1;
			
--- a/src/core/dependencies/implicit_data_deps.h
+++ b/src/core/dependencies/implicit_data_deps.h
@@ -30,7 +30,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 
				 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
			
 
				 
			
 
				 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
			
 
				-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);
			
 
				+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
			
 
				 
			
 
				 /** Register a hook to be called when a write is submitted */
			
 
				 void _starpu_implicit_data_deps_write_hook(void (*func)(starpu_data_handle_t));
			
--- a/src/core/dependencies/tags.c
+++ b/src/core/dependencies/tags.c
@@ -120,13 +120,23 @@ static void _starpu_tag_free(void *_tag)
 
				 			unsigned STARPU_ATTRIBUTE_UNUSED remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);
			
 
				 
			
 
				 			if (!ntags && (cg->cg_type == STARPU_CG_TAG))
			
 
				+			{
			
 
				 				/* Last tag this cg depends on, cg becomes unreferenced */
			
 
				+#ifdef STARPU_DEBUG
			
 
				+				free(cg->deps);
			
 
				+				free(cg->done);
			
 
				+#endif
			
 
				 				free(cg);
			
 
				+			}
			
 
				 		}
			
 
				 
			
 
				 #ifdef STARPU_DYNAMIC_DEPS_SIZE
			
 
				 		free(tag->tag_successors.succ);
			
 
				 #endif
			
 
				+#ifdef STARPU_DEBUG
			
 
				+		free(tag->tag_successors.deps);
			
 
				+		free(tag->tag_successors.done);
			
 
				+#endif
			
 
				 
			
 
				 		_starpu_spin_unlock(&tag->lock);
			
 
				 		_starpu_spin_destroy(&tag->lock);
			
@@ -221,7 +231,7 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 
				 	return tag;
			
 
				 }
			
 
				 
			
 
				-/* lock should be taken */
			
 
				+/* lock should be taken, and this releases it */
			
 
				 void _starpu_tag_set_ready(struct _starpu_tag *tag)
			
 
				 {
			
 
				 	/* mark this tag as ready to run */
			
@@ -229,6 +239,10 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 
				 	/* declare it to the scheduler ! */
			
 
				 	struct _starpu_job *j = tag->job;
			
 
				 
			
 
				+	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
			
 
				+	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
			
 
				+	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
			
 
				+
			
 
				 	/* In case the task job is going to be scheduled immediately, and if
			
 
				 	 * the task is "empty", calling _starpu_push_task would directly try to enforce
			
 
				 	 * the dependencies of the task, and therefore it would try to grab the
			
@@ -238,14 +252,9 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 
				 	/* enforce data dependencies */
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
			
 
				 	_starpu_enforce_deps_starting_from_task(j);
			
 
				-
			
 
				-	_starpu_spin_lock(&tag->lock);
			
 
				-	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
			
 
				-	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
			
 
				-	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
			
 
				 }
			
 
				 
			
 
				-/* the lock must be taken ! */
			
 
				+/* the lock of the tag must already be taken ! */
			
 
				 static void _starpu_tag_add_succ(struct _starpu_tag *tag, struct _starpu_cg *cg)
			
 
				 {
			
 
				 	STARPU_ASSERT(tag);
			
@@ -396,12 +405,10 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 
				 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
			
 
				 		STARPU_ASSERT(tag_dep != tag_child);
			
 
				 		_starpu_spin_lock(&tag_dep->lock);
			
 
				-		_starpu_spin_lock(&tag_child->lock);
			
 
				 		_starpu_tag_add_succ(tag_dep, cg);
			
 
				 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
			
 
				 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
			
 
				 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
			
 
				-		_starpu_spin_unlock(&tag_child->lock);
			
 
				 		_starpu_spin_unlock(&tag_dep->lock);
			
 
				 	}
			
 
				 }
			
@@ -434,12 +441,10 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 
				 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
			
 
				 		STARPU_ASSERT(tag_dep != tag_child);
			
 
				 		_starpu_spin_lock(&tag_dep->lock);
			
 
				-		_starpu_spin_lock(&tag_child->lock);
			
 
				 		_starpu_tag_add_succ(tag_dep, cg);
			
 
				 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
			
 
				 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
			
 
				 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
			
 
				-		_starpu_spin_unlock(&tag_child->lock);
			
 
				 		_starpu_spin_unlock(&tag_dep->lock);
			
 
				 	}
			
 
				 	va_end(pa);
			
--- a/src/core/dependencies/tags.h
+++ b/src/core/dependencies/tags.h
@@ -68,6 +68,8 @@ void _starpu_notify_tag_dependencies(struct _starpu_tag *tag);
 
				 void _starpu_notify_job_start_tag_dependencies(struct _starpu_tag *tag, _starpu_notify_job_start_data *data);
			
 
				 
			
 
				 void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job);
			
 
				+
			
 
				+/* lock should be taken, and this releases it */
			
 
				 void _starpu_tag_set_ready(struct _starpu_tag *tag);
			
 
				 
			
 
				 unsigned _starpu_submit_job_enforce_task_deps(struct _starpu_job *j);
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -82,7 +82,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
				 
			
 
				 	job->task = task;
			
 
				 
			
 
				-#ifndef STARPU_USE_FXT
			
 
				+#if !defined(STARPU_USE_FXT) && !defined(STARPU_DEBUG)
			
 
				 	if (_starpu_bound_recording || _starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1 || STARPU_AYU_EVENT)
			
 
				 #endif
			
 
				 	{
			
--- a/src/core/perfmodel/multiple_regression.c
+++ b/src/core/perfmodel/multiple_regression.c
@@ -236,6 +236,9 @@ int dgels_multiple_reg_coeff(double *mpar, double *my, unsigned long nn, unsigne
 
				 	if( info != 0 )
			
 
				 	{
			
 
				 		_STARPU_DISP("Warning: Problems when executing dgels_ function. It seems like the diagonal element %ld is zero.\n Multiple linear regression model will not be written into perfmodel file.\n", info);
			
 
				+		free(X);
			
 
				+		free(Y);
			
 
				+		free(work);
			
 
				 		return 1;
			
 
				 	}
			
 
				 
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -862,7 +862,7 @@ uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 
				 
			
 
				 /* in case the data was accessed on a write mode, do not forget to
			
 
				  * make it accessible again once it is possible ! */
			
 
				-void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, struct _starpu_data_replicate *replicate)
			
 
				+void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
			
 
				 {
			
 
				 	uint32_t wt_mask;
			
 
				 	wt_mask = default_wt_mask | handle->wt_mask;
			
@@ -887,14 +887,17 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 
				 	if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
 
				 
			
 
				-	/* Release refcnt taken by fetch_data_on_node */
			
 
				-	replicate->refcnt--;
			
 
				-	STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
			
 
				+	if (down_to_mode == STARPU_NONE)
			
 
				+	{
			
 
				+		/* Release refcnt taken by fetch_data_on_node */
			
 
				+		replicate->refcnt--;
			
 
				+		STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
			
 
				 
			
 
				-	STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
			
 
				-	handle->busy_count--;
			
 
				+		STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
			
 
				+		handle->busy_count--;
			
 
				+	}
			
 
				 
			
 
				-	if (!_starpu_notify_data_dependencies(handle))
			
 
				+	if (!_starpu_notify_data_dependencies(handle, down_to_mode))
			
 
				 		_starpu_spin_unlock(&handle->header_lock);
			
 
				 }
			
 
				 
			
@@ -1229,7 +1232,7 @@ enomem:
 
				 
			
 
				 		local_replicate = get_replicate(handle, mode, workerid, node);
			
 
				 
			
 
				-		_starpu_release_data_on_node(handle, 0, local_replicate);
			
 
				+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
			
 
				 	}
			
 
				 
			
 
				 	return -1;
			
@@ -1338,13 +1341,13 @@ void __starpu_push_task_output(struct _starpu_job *j)
 
				 		if (node == -1)
			
 
				 		{
			
 
				 			/* NOWHERE case, just notify dependencies */
			
 
				-			if (!_starpu_notify_data_dependencies(handle))
			
 
				+			if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
			
 
				 				_starpu_spin_unlock(&handle->header_lock);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				 			_starpu_spin_unlock(&handle->header_lock);
			
 
				-			_starpu_release_data_on_node(handle, 0, local_replicate);
			
 
				+			_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/src/datawizard/coherency.h
+++ b/src/datawizard/coherency.h
@@ -316,6 +316,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
				 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
			
 
				 /** This releases a reference on the handle */
			
 
				 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
			
 
				+				  enum starpu_data_access_mode down_to_mode,
			
 
				 				  struct _starpu_data_replicate *replicate);
			
 
				 
			
 
				 void _starpu_update_data_state(starpu_data_handle_t handle,
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -375,8 +375,8 @@ int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
 
				 	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) numblocks_1 * ld1_src, (unsigned long) ld2_src);
			
 
				 	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) numblocks_1 * ld1_dst, (unsigned long) ld2_dst);
			
 
				 
			
 
				-	if (ld1_src * ld2_src == blocksize * numblocks_1 &&
			
 
				-	    ld1_dst * ld2_dst == blocksize * numblocks_1)
			
 
				+	if (ld2_src == blocksize * numblocks_1 &&
			
 
				+	    ld2_dst == blocksize * numblocks_1)
			
 
				 		/* Optimize contiguous case */
			
 
				 		return starpu_interface_copy(src, src_offset, src_node,
			
 
				 					     dst, dst_offset, dst_node,
			
@@ -425,8 +425,8 @@ int starpu_interface_copy4d(uintptr_t src, size_t src_offset, unsigned src_node,
 
				 	STARPU_ASSERT_MSG(ld3_src >= numblocks_2 * ld2_src, "block group group size %lu is bigger than group group ld %lu in source", (unsigned long) numblocks_2 * ld2_src, (unsigned long) ld3_src);
			
 
				 	STARPU_ASSERT_MSG(ld3_dst >= numblocks_2 * ld2_dst, "block group group size %lu is bigger than group group ld %lu in destination", (unsigned long) numblocks_2 * ld2_dst, (unsigned long) ld3_dst);
			
 
				 
			
 
				-	if (ld1_src * ld2_src * ld3_src == blocksize * numblocks_1 * numblocks_2 &&
			
 
				-	    ld1_dst * ld2_dst * ld3_dst == blocksize * numblocks_1 * numblocks_2)
			
 
				+	if (ld3_src == blocksize * numblocks_1 * numblocks_2 &&
			
 
				+	    ld3_dst == blocksize * numblocks_1 * numblocks_2)
			
 
				 		/* Optimize contiguous case */
			
 
				 		return starpu_interface_copy(src, src_offset, src_node,
			
 
				 					     dst, dst_offset, dst_node,
			
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -749,12 +749,12 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 
				 	{
			
 
				 		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
			
 
				 		STARPU_ASSERT(!ret);
			
 
				-		_starpu_release_data_on_node(handle, handle->home_node, replicate);
			
 
				+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
 
				-		if (!_starpu_notify_data_dependencies(handle))
			
 
				+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
			
 
				 			_starpu_spin_unlock(&handle->header_lock);
			
 
				 	}
			
 
				 }
			
@@ -836,6 +836,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
				 	int sequential_consistency = handle->sequential_consistency;
			
 
				 	if (sequential_consistency && !nowait)
			
 
				 	{
			
 
				+		/* We will acquire it in write mode to catch all dependencies,
			
 
				+		 * but possibly it's not actually initialized. Fake it to avoid
			
 
				+		 getting caught doing it */
			
 
				+		handle->initialized = 1;
			
 
				 		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_data_unregister must not be called from a task or callback, perhaps you can use starpu_data_unregister_submit instead");
			
 
				 
			
 
				 		/* If sequential consistency is enabled, wait until data is available */
			
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -485,16 +485,26 @@ int starpu_data_acquire_try(starpu_data_handle_t handle, enum starpu_data_access
 
				 
			
 
				 /* This function must be called after starpu_data_acquire so that the
			
 
				  * application release the data */
			
 
				-void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
			
 
				+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int node)
			
 
				 {
			
 
				 	STARPU_ASSERT(handle);
			
 
				 
			
 
				+	if (mode == STARPU_RW)
			
 
				+		/* They are equivalent here, and current_mode is never STARPU_RW */
			
 
				+		mode = STARPU_W;
			
 
				+
			
 
				+	STARPU_ASSERT_MSG(mode == STARPU_NONE ||
			
 
				+			  mode == handle->current_mode ||
			
 
				+			  (mode == STARPU_R &&
			
 
				+			     handle->current_mode == STARPU_W),
			
 
				+		"We only support releasing from W to R");
			
 
				+
			
 
				 	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
			
 
				-	_starpu_unlock_post_sync_tasks(handle);
			
 
				+	_starpu_unlock_post_sync_tasks(handle, mode);
			
 
				 
			
 
				 	/* The application can now release the rw-lock */
			
 
				 	if (node >= 0)
			
 
				-		_starpu_release_data_on_node(handle, 0, &handle->per_node[node]);
			
 
				+		_starpu_release_data_on_node(handle, 0, mode, &handle->per_node[node]);
			
 
				 	else
			
 
				 	{
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
@@ -505,17 +515,27 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 
				 				handle->per_node[i].refcnt--;
			
 
				 		}
			
 
				 		handle->busy_count--;
			
 
				-		if (!_starpu_notify_data_dependencies(handle))
			
 
				+		if (!_starpu_notify_data_dependencies(handle, mode))
			
 
				 			_starpu_spin_unlock(&handle->header_lock);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				-void starpu_data_release(starpu_data_handle_t handle)
			
 
				+void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
			
 
				+{
			
 
				+	starpu_data_release_to_on_node(handle, STARPU_NONE, node);
			
 
				+}
			
 
				+
			
 
				+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
			
 
				 {
			
 
				 	int home_node = handle->home_node;
			
 
				 	if (home_node < 0)
			
 
				 		home_node = STARPU_MAIN_RAM;
			
 
				-	starpu_data_release_on_node(handle, home_node);
			
 
				+	starpu_data_release_to_on_node(handle, mode, home_node);
			
 
				+}
			
 
				+
			
 
				+void starpu_data_release(starpu_data_handle_t handle)
			
 
				+{
			
 
				+	starpu_data_release_to(handle, STARPU_NONE);
			
 
				 }
			
 
				 
			
 
				 static void _prefetch_data_on_node(void *arg)
			
@@ -531,7 +551,7 @@ static void _prefetch_data_on_node(void *arg)
 
				 		_starpu_data_acquire_wrapper_finished(wrapper);
			
 
				 
			
 
				 	_starpu_spin_lock(&handle->header_lock);
			
 
				-	if (!_starpu_notify_data_dependencies(handle))
			
 
				+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
			
 
				 		_starpu_spin_unlock(&handle->header_lock);
			
 
				 }
			
 
				 
			
@@ -581,7 +601,7 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
				 		/* In case there was a temporary handle (eg. used for reduction), this
			
 
				 		 * handle may have requested to be destroyed when the data is released
			
 
				 		 * */
			
 
				-		if (!_starpu_notify_data_dependencies(handle))
			
 
				+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
			
 
				 			_starpu_spin_unlock(&handle->header_lock);
			
 
				 	}
			
 
				 	else if (!async)
			
@@ -666,6 +686,9 @@ static void _starpu_data_wont_use(void *data)
 
				 
			
 
				 void starpu_data_wont_use(starpu_data_handle_t handle)
			
 
				 {
			
 
				+	if (!handle->initialized)
			
 
				+		/* No value atm actually */
			
 
				+		return;
			
 
				 	_STARPU_TRACE_DATA_WONT_USE(handle);
			
 
				 	starpu_data_acquire_on_node_cb_sequential_consistency_quick(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_R, _starpu_data_wont_use, handle, 1, 1);
			
 
				 }
			
--- a/src/datawizard/write_back.c
+++ b/src/datawizard/write_back.c
@@ -24,7 +24,7 @@ static void wt_callback(void *arg)
 
				 	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
			
 
				 
			
 
				 	_starpu_spin_lock(&handle->header_lock);
			
 
				-	if (!_starpu_notify_data_dependencies(handle))
			
 
				+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
			
 
				 		_starpu_spin_unlock(&handle->header_lock);
			
 
				 }
			
 
				 
			
--- a/src/debug/latency.c
+++ b/src/debug/latency.c
@@ -36,7 +36,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 
				 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
			
 
				 		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
			
 
				 		STARPU_ASSERT(!ret);
			
 
				-		_starpu_release_data_on_node(handle, node0, replicate_0);
			
 
				+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
			
 
				 
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
 
				 		handle->refcnt++;
			
@@ -46,6 +46,6 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 
				 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
			
 
				 		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
			
 
				 		STARPU_ASSERT(!ret);
			
 
				-		_starpu_release_data_on_node(handle, node1, replicate_1);
			
 
				+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
			
 
				 	}
			
 
				 }
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -333,7 +333,7 @@ static struct data_info *get_data(unsigned long handle, int mpi_rank)
 
				 	return data;
			
 
				 }
			
 
				 
			
 
				-static void handle_papi_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
 
				+static void handle_papi_event(struct fxt_ev_64 *ev STARPU_ATTRIBUTE_UNUSED, struct starpu_fxt_options *options STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 #ifdef STARPU_PAPI
			
 
				 	int event_code = ev->param[0];
			
@@ -2307,9 +2307,11 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
				 			snprintf(paje_key, sizeof(paje_key), "com_%u", comid);
			
 
				 			program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
			
 
				 			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, src);
			
 
				-			poti_StartLink(time, program_container, link_type, src_memnode_container, paje_value, paje_key);
			
 
				+			char str_handle[STARPU_POTI_STR_LEN];
			
 
				+			snprintf(str_handle, sizeof(str_handle), "%lx", handle);
			
 
				+			poti_user_StartLink(_starpu_poti_CommLinkStart, time, program_container, link_type, src_memnode_container, paje_value, paje_key, 1, str_handle);
			
 
				 #else
			
 
				-			fprintf(out_paje_file, "18	%.9f	%s	%sp	%u	%smm%u	com_%u\n", time, link_type, prefix, size, prefix, src, comid);
			
 
				+			fprintf(out_paje_file, "24	%.9f	%s	%sp	%u	%smm%u	com_%u	%lx\n", time, link_type, prefix, size, prefix, src, comid, handle);
			
 
				 #endif
			
 
				 		}
			
 
				 
			
@@ -2636,10 +2638,12 @@ static void handle_job_push(struct fxt_ev_64 *ev, struct starpu_fxt_options *opt
 
				                char paje_value[STARPU_POTI_STR_LEN];
			
 
				                snprintf(paje_value, sizeof(paje_value), "%u", task);
			
 
				                snprintf(container, sizeof(container), "%sp", options->file_prefix);
			
 
				-               poti_NewEvent(get_event_time_stamp(ev, options), container, "pu", paje_value);
			
 
				+		if (!options->no_events)
			
 
				+			poti_NewEvent(get_event_time_stamp(ev, options), container, "pu", paje_value);
			
 
				 #else
			
 
				-	       fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
			
 
				-               fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "pu", options->file_prefix, task);
			
 
				+		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
			
 
				+		if (!options->no_events)
			
 
				+			fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "pu", options->file_prefix, task);
			
 
				 #endif
			
 
				 	}
			
 
				 
			
@@ -2652,9 +2656,9 @@ static void handle_job_push(struct fxt_ev_64 *ev, struct starpu_fxt_options *opt
 
				 		fprintf(sched_tasks_file, "Time: %.9f\n", current_timestamp);
			
 
				 		fprintf(sched_tasks_file, "Priority: %d\n", priority);
			
 
				 		if (options->file_rank < 0)
			
 
				-			fprintf(sched_tasks_file, "JobId: %d\n", task);
			
 
				+			fprintf(sched_tasks_file, "JobId: %u\n", task);
			
 
				 		else
			
 
				-			fprintf(sched_tasks_file, "JobId: %d_%d\n", options->file_rank, task);
			
 
				+			fprintf(sched_tasks_file, "JobId: %d_%u\n", options->file_rank, task);
			
 
				 		fprintf(sched_tasks_file, "\n");
			
 
				 	}
			
 
				 }
			
@@ -2681,11 +2685,13 @@ static void handle_job_pop(struct fxt_ev_64 *ev, struct starpu_fxt_options *opti
 
				 		char paje_value[STARPU_POTI_STR_LEN];
			
 
				 		snprintf(paje_value, sizeof(paje_value), "%u", task);
			
 
				 		snprintf(container, sizeof(container), "%sp", options->file_prefix);
			
 
				-		poti_NewEvent(get_event_time_stamp(ev, options), container, "po", paje_value);
			
 
				+		if (!options->no_events)
			
 
				+			poti_NewEvent(get_event_time_stamp(ev, options), container, "po", paje_value);
			
 
				 #else
			
 
				 		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
			
 
				 		fprintf(out_paje_file, "13	%.9f	%ssched	nsubmitted	%f\n", current_timestamp, options->file_prefix, (float)nsubmitted);
			
 
				-		fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "po", options->file_prefix, task);
			
 
				+		if (!options->no_events)
			
 
				+			fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "po", options->file_prefix, task);
			
 
				 #endif
			
 
				 	}
			
 
				 
			
@@ -2701,9 +2707,9 @@ static void handle_job_pop(struct fxt_ev_64 *ev, struct starpu_fxt_options *opti
 
				 		fprintf(sched_tasks_file, "Time: %.9f\n", current_timestamp);
			
 
				 		fprintf(sched_tasks_file, "Priority: %d\n", priority);
			
 
				 		if (options->file_rank < 0)
			
 
				-			fprintf(sched_tasks_file, "JobId: %d\n", task);
			
 
				+			fprintf(sched_tasks_file, "JobId: %u\n", task);
			
 
				 		else
			
 
				-			fprintf(sched_tasks_file, "JobId: %d_%d\n", options->file_rank, task);
			
 
				+			fprintf(sched_tasks_file, "JobId: %d_%u\n", options->file_rank, task);
			
 
				 		fprintf(sched_tasks_file, "\n");
			
 
				 	}
			
 
				 }
			
@@ -4450,7 +4456,7 @@ void _starpu_fxt_number_events_file_close(void)
 
				 		for (i = 0; i <= FUT_SETUP_CODE; i++)
			
 
				 		{
			
 
				 			if (number_events[i] > 0)
			
 
				-				fprintf(number_events_file, "0x%x\t%lu\n", i, number_events[i]);
			
 
				+				fprintf(number_events_file, "0x%x\t%"PRIu64"\n", i, number_events[i]);
			
 
				 		}
			
 
				 
			
 
				 		free(number_events);
			
--- a/src/debug/traces/starpu_fxt.h
+++ b/src/debug/traces/starpu_fxt.h
@@ -68,6 +68,7 @@ void _starpu_fxt_write_paje_header(FILE *file, struct starpu_fxt_options *option
 
				 extern int _starpu_poti_extendedSetState;
			
 
				 extern int _starpu_poti_semiExtendedSetState;
			
 
				 extern int _starpu_poti_MemoryEvent;
			
 
				+extern int _starpu_poti_CommLinkStart;
			
 
				 extern int _starpu_poti_MpiLinkStart;
			
 
				 
			
 
				 /*
			
--- a/src/debug/traces/starpu_fxt_mpi.c
+++ b/src/debug/traces/starpu_fxt_mpi.c
@@ -331,7 +331,9 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
				 
			
 
				 			char str_mpi_tag[STARPU_POTI_STR_LEN];
			
 
				 			snprintf(str_mpi_tag, sizeof(str_mpi_tag), "%ld", mpi_tag);
			
 
				-			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 1, str_mpi_tag);
			
 
				+			char str_handle[STARPU_POTI_STR_LEN];
			
 
				+			snprintf(str_handle, sizeof(str_handle), "%lx", send_handle);
			
 
				+			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 2, str_mpi_tag, str_handle);
			
 
				 
			
 
				 			poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_out_bandwidth[src]);
			
 
				 			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
			
@@ -340,7 +342,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
				 #else
			
 
				 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo_mpi	%f\n", start_date, src, current_out_bandwidth[src]);
			
 
				 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi_mpi	%f\n", start_date, dst, current_in_bandwidth[dst]);
			
 
				-			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld\n", start_date, (unsigned long)size, src, id, mpi_tag);
			
 
				+			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld	%lx\n", start_date, (unsigned long)size, src, id, mpi_tag, send_handle);
			
 
				 			fprintf(out_paje_file, "19	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
			
 
				 #endif
			
 
				 
			
@@ -354,10 +356,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
				 				fprintf(out_comms_file, "SendHandle: %lx\n", send_handle);
			
 
				 				fprintf(out_comms_file, "RecvHandle: %lx\n", recv_handle);
			
 
				 				if (cur->jobid != -1)
			
 
				-					fprintf(out_comms_file, "SendJobId: %d_%lu\n", src, cur->jobid);
			
 
				+					fprintf(out_comms_file, "SendJobId: %d_%ld\n", src, cur->jobid);
			
 
				 				if (match->jobid != -1)
			
 
				-					fprintf(out_comms_file, "RecvJobId: %d_%lu\n", dst, match->jobid);
			
 
				-				fprintf(out_comms_file, "Size: %ld\n", size);
			
 
				+					fprintf(out_comms_file, "RecvJobId: %d_%ld\n", dst, match->jobid);
			
 
				+				fprintf(out_comms_file, "Size: %lu\n", (unsigned long)size);
			
 
				 				fprintf(out_comms_file, "\n");
			
 
				 			}
			
 
				 		}
			
@@ -372,7 +374,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
				 	if (nb_wrong_comm_timing == 1)
			
 
				 		_STARPU_MSG("Warning: a communication finished before it started !\n");
			
 
				 	else if (nb_wrong_comm_timing > 1)
			
 
				-		_STARPU_MSG("Warning: %d communications finished before they started !\n", nb_wrong_comm_timing);
			
 
				+		_STARPU_MSG("Warning: %u communications finished before they started !\n", nb_wrong_comm_timing);
			
 
				 }
			
 
				 
			
 
				 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
			
--- a/src/debug/traces/starpu_paje.c
+++ b/src/debug/traces/starpu_paje.c
@@ -28,6 +28,7 @@
 
				 int _starpu_poti_extendedSetState = -1;
			
 
				 int _starpu_poti_semiExtendedSetState = -1;
			
 
				 int _starpu_poti_MemoryEvent = -1;
			
 
				+int _starpu_poti_CommLinkStart = -1;
			
 
				 int _starpu_poti_MpiLinkStart = -1;
			
 
				 #endif
			
 
				 #endif
			
@@ -62,6 +63,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 
				 						     "SubmitOrder string"
			
 
				 						     );
			
 
				 #ifdef HAVE_POTI_USER_NEWEVENT
			
 
				+	_starpu_poti_CommLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "Handle string");
			
 
				 	if (options->memory_states)
			
 
				 	{
			
 
				 		_starpu_poti_MemoryEvent = poti_header_DeclareEvent (PAJE_NewEvent,
			
@@ -71,7 +73,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 
				 							     "Size string",
			
 
				 							     "Dest string");
			
 
				 	}
			
 
				-	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "MPITAG string");
			
 
				+	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 2, "MPITAG string", "Handle string");
			
 
				 #endif
			
 
				 #else
			
 
				 	poti_header(1,1);
			
@@ -230,15 +232,16 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 
				 	fprintf(file, "%%	StartContainer	string\n");
			
 
				 	fprintf(file, "%%	Key	string\n");
			
 
				 	fprintf(file, "%%	MPITAG	string\n");
			
 
				+	fprintf(file, "%%	Handle	string\n");
			
 
				 	fprintf(file, "%%EndEventDef\n");
			
 
				-	fprintf(file, "%%EventDef	PajeEndLink	24\n");
			
 
				+	fprintf(file, "%%EventDef	PajeStartLink	24\n");
			
 
				 	fprintf(file, "%%	Time	date\n");
			
 
				 	fprintf(file, "%%	Type	string\n");
			
 
				 	fprintf(file, "%%	Container	string\n");
			
 
				 	fprintf(file, "%%	Value	string\n");
			
 
				-	fprintf(file, "%%	EndContainer	string\n");
			
 
				+	fprintf(file, "%%	StartContainer	string\n");
			
 
				 	fprintf(file, "%%	Key	string\n");
			
 
				-	fprintf(file, "%%	MPITAG	string\n");
			
 
				+	fprintf(file, "%%	Handle	string\n");
			
 
				 	fprintf(file, "%%EndEventDef\n");
			
 
				 #endif
			
 
				 
			
--- a/src/sched_policies/component_work_stealing.c
+++ b/src/sched_policies/component_work_stealing.c
@@ -30,14 +30,20 @@
 
				 #warning TODO: locality work-stealing
			
 
				 #endif
			
 
				 
			
 
				+struct _starpu_component_work_stealing_data_per_worker
			
 
				+{
			
 
				+	struct _starpu_prio_deque fifo;
			
 
				+	unsigned last_pop_child;
			
 
				+};
			
 
				+
			
 
				 struct _starpu_component_work_stealing_data
			
 
				 {
			
 
				 /* keep track of the work performed from the beginning of the algorithm to make
			
 
				  * better decisions about which queue to child when stealing or deferring work
			
 
				  */
			
 
				-	unsigned performed_total, last_pop_child, last_push_child;
			
 
				+	struct _starpu_component_work_stealing_data_per_worker *per_worker;
			
 
				+	unsigned performed_total, last_push_child;
			
 
				 
			
 
				-	struct _starpu_prio_deque * fifos;
			
 
				 	starpu_pthread_mutex_t ** mutexes;
			
 
				 	unsigned size;
			
 
				 };
			
@@ -50,16 +56,14 @@ struct _starpu_component_work_stealing_data
 
				 static struct starpu_task *  steal_task_round_robin(struct starpu_sched_component *component, int workerid)
			
 
				 {
			
 
				 	struct _starpu_component_work_stealing_data *wsd = component->data;
			
 
				-	STARPU_HG_DISABLE_CHECKING(wsd->last_pop_child);
			
 
				-	unsigned i = wsd->last_pop_child;
			
 
				-	wsd->last_pop_child = (i + 1) % component->nchildren;
			
 
				-	STARPU_HG_ENABLE_CHECKING(wsd->last_pop_child);
			
 
				+	unsigned i = wsd->per_worker[workerid].last_pop_child;
			
 
				+	wsd->per_worker[workerid].last_pop_child = (i + 1) % component->nchildren;
			
 
				 	/* If the worker's queue have no suitable tasks, let's try
			
 
				 	 * the next ones */
			
 
				 	struct starpu_task * task = NULL;
			
 
				 	while (1)
			
 
				 	{
			
 
				-		struct _starpu_prio_deque * fifo = &wsd->fifos[i];
			
 
				+		struct _starpu_prio_deque * fifo = &wsd->per_worker[i].fifo;
			
 
				 
			
 
				 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
			
 
				 		task = _starpu_prio_deque_deque_task_for_worker(fifo, workerid, NULL);
			
@@ -75,7 +79,7 @@ static struct starpu_task *  steal_task_round_robin(struct starpu_sched_componen
 
				 			break;
			
 
				 		}
			
 
				 
			
 
				-		if (i == wsd->last_pop_child)
			
 
				+		if (i == wsd->per_worker[workerid].last_pop_child)
			
 
				 		{
			
 
				 			/* We got back to the first worker,
			
 
				 			 * don't go in infinite loop */
			
@@ -141,17 +145,17 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 
				 	struct _starpu_component_work_stealing_data * wsd = component->data;
			
 
				 	const double now = starpu_timing_now();
			
 
				 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
			
 
				-	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->fifos[i]);
			
 
				+	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->per_worker[i].fifo);
			
 
				 	if(task)
			
 
				 	{
			
 
				 		if(!isnan(task->predicted))
			
 
				 		{
			
 
				-			wsd->fifos[i].exp_len -= task->predicted;
			
 
				-			wsd->fifos[i].exp_start = now + task->predicted;
			
 
				+			wsd->per_worker[i].fifo.exp_len -= task->predicted;
			
 
				+			wsd->per_worker[i].fifo.exp_start = now + task->predicted;
			
 
				 		}
			
 
				 	}
			
 
				 	else
			
 
				-		wsd->fifos[i].exp_len = 0.0;
			
 
				+		wsd->per_worker[i].fifo.exp_len = 0.0;
			
 
				 
			
 
				 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
			
 
				 	if(task)
			
@@ -163,7 +167,7 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 
				 	if(task)
			
 
				 	{
			
 
				 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
			
 
				-		wsd->fifos[i].nprocessed++;
			
 
				+		wsd->per_worker[i].fifo.nprocessed++;
			
 
				 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
			
 
				 
			
 
				 		return task;
			
@@ -196,9 +200,9 @@ double _ws_estimated_end(struct starpu_sched_component * component)
 
				 	for(i = 0; i < component->nchildren; i++)
			
 
				 	{
			
 
				 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
			
 
				-		sum_len += wsd->fifos[i].exp_len;
			
 
				-		wsd->fifos[i].exp_start = STARPU_MAX(now, wsd->fifos[i].exp_start);
			
 
				-		sum_start += wsd->fifos[i].exp_start;
			
 
				+		sum_len += wsd->per_worker[i].fifo.exp_len;
			
 
				+		wsd->per_worker[i].fifo.exp_start = STARPU_MAX(now, wsd->per_worker[i].fifo.exp_start);
			
 
				+		sum_start += wsd->per_worker[i].fifo.exp_start;
			
 
				 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
			
 
				 
			
 
				 	}
			
@@ -216,7 +220,7 @@ double _ws_estimated_load(struct starpu_sched_component * component)
 
				 	for(i = 0; i < component->nchildren; i++)
			
 
				 	{
			
 
				 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
			
 
				-		ntasks += wsd->fifos[i].ntasks;
			
 
				+		ntasks += wsd->per_worker[i].fifo.ntasks;
			
 
				 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
			
 
				 	}
			
 
				 	double speedup = 0.0;
			
@@ -265,7 +269,7 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 
				 
			
 
				 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
			
 
				 	starpu_sched_task_break(task);
			
 
				-	ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i], task);
			
 
				+	ret = _starpu_prio_deque_push_front_task(&wsd->per_worker[i].fifo, task);
			
 
				 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
			
 
				 
			
 
				 	wsd->last_push_child = i;
			
@@ -308,9 +312,9 @@ int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task)
 
				 
			
 
				 			struct _starpu_component_work_stealing_data * wsd = component->data;
			
 
				 			STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
			
 
				-			int ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i] , task);
			
 
				+			int ret = _starpu_prio_deque_push_front_task(&wsd->per_worker[i].fifo , task);
			
 
				 			if(ret == 0 && !isnan(task->predicted))
			
 
				-				wsd->fifos[i].exp_len += task->predicted;
			
 
				+				wsd->per_worker[i].fifo.exp_len += task->predicted;
			
 
				 			STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
			
 
				 
			
 
				 			component->can_pull(component);
			
@@ -329,12 +333,13 @@ void _ws_add_child(struct starpu_sched_component * component, struct starpu_sche
 
				 	if(wsd->size < component->nchildren)
			
 
				 	{
			
 
				 		STARPU_ASSERT(wsd->size == component->nchildren - 1);
			
 
				-		_STARPU_REALLOC(wsd->fifos, component->nchildren * sizeof(*wsd->fifos));
			
 
				+		_STARPU_REALLOC(wsd->per_worker, component->nchildren * sizeof(*wsd->per_worker));
			
 
				 		_STARPU_REALLOC(wsd->mutexes, component->nchildren * sizeof(*wsd->mutexes));
			
 
				 		wsd->size = component->nchildren;
			
 
				 	}
			
 
				 
			
 
				-	_starpu_prio_deque_init(&wsd->fifos[component->nchildren - 1]);
			
 
				+	wsd->per_worker[component->nchildren - 1].last_pop_child = 0;
			
 
				+	_starpu_prio_deque_init(&wsd->per_worker[component->nchildren - 1].fifo);
			
 
				 
			
 
				 	starpu_pthread_mutex_t *mutex;
			
 
				 	_STARPU_MALLOC(mutex, sizeof(*mutex));
			
@@ -356,8 +361,8 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 
				 			break;
			
 
				 	}
			
 
				 	STARPU_ASSERT(i_component != component->nchildren);
			
 
				-	struct _starpu_prio_deque tmp_fifo = wsd->fifos[i_component];
			
 
				-	wsd->fifos[i_component] = wsd->fifos[component->nchildren - 1];
			
 
				+	struct _starpu_prio_deque tmp_fifo = wsd->per_worker[i_component].fifo;
			
 
				+	wsd->per_worker[i_component].fifo = wsd->per_worker[component->nchildren - 1].fifo;
			
 
				 
			
 
				 
			
 
				 	component->children[i_component] = component->children[component->nchildren - 1];
			
@@ -372,7 +377,7 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 
				 void _work_stealing_component_deinit_data(struct starpu_sched_component * component)
			
 
				 {
			
 
				 	struct _starpu_component_work_stealing_data * wsd = component->data;
			
 
				-	free(wsd->fifos);
			
 
				+	free(wsd->per_worker);
			
 
				 	free(wsd->mutexes);
			
 
				 	free(wsd);
			
 
				 }
			
--- a/src/sched_policies/work_stealing_policy.c
+++ b/src/sched_policies/work_stealing_policy.c
@@ -71,10 +71,21 @@ struct locality_entry
 
				 
			
 
				 struct _starpu_work_stealing_data_per_worker
			
 
				 {
			
 
				+	char fill1[STARPU_CACHELINE_SIZE];
			
 
				+	/* This is read-mostly, only updated when the queue becomes empty or
			
 
				+	 * becomes non-empty, to make it generally cheap to check */
			
 
				+	unsigned notask;	/* whether the queue is empty */
			
 
				+	char fill2[STARPU_CACHELINE_SIZE];
			
 
				+
			
 
				 	struct _starpu_prio_deque queue;
			
 
				 	int running;
			
 
				 	int *proxlist;
			
 
				-	int busy;
			
 
				+	int busy;	/* Whether this worker is working on a task */
			
 
				+
			
 
				+	/* keep track of the work performed from the beginning of the algorithm to make
			
 
				+	 * better decisions about which queue to select when deferring work
			
 
				+	 */
			
 
				+	unsigned last_pop_worker;
			
 
				 
			
 
				 #ifdef USE_LOCALITY_TASKS
			
 
				 	/* This records the same as queue, but hashed by data accessed with locality flag.  */
			
@@ -93,9 +104,8 @@ struct _starpu_work_stealing_data
 
				 	int (*select_victim)(struct _starpu_work_stealing_data *, unsigned, int);
			
 
				 	struct _starpu_work_stealing_data_per_worker *per_worker;
			
 
				 	/* keep track of the work performed from the beginning of the algorithm to make
			
 
				-	 * better decisions about which queue to select when stealing or deferring work
			
 
				+	 * better decisions about which queue to select when deferring work
			
 
				 	 */
			
 
				-	unsigned last_pop_worker;
			
 
				 	unsigned last_push_worker;
			
 
				 };
			
 
				 
			
@@ -118,7 +128,8 @@ static int calibration_value = 0;
 
				  */
			
 
				 static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsigned sched_ctx_id)
			
 
				 {
			
 
				-	unsigned worker = ws->last_pop_worker;
			
 
				+	unsigned workerid = starpu_worker_get_id_check();
			
 
				+	unsigned worker = ws->per_worker[workerid].last_pop_worker;
			
 
				 	unsigned nworkers;
			
 
				 	int *workerids = NULL;
			
 
				 	nworkers = starpu_sched_ctx_get_workers_list_raw(sched_ctx_id, &workerids);
			
@@ -131,14 +142,17 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 
				 		/* Here helgrind would shout that this is unprotected, but we
			
 
				 		 * are fine with getting outdated values, this is just an
			
 
				 		 * estimation */
			
 
				-		ntasks = ws->per_worker[workerids[worker]].queue.ntasks;
			
 
				-
			
 
				-		if (ntasks && (ws->per_worker[workerids[worker]].busy
			
 
				-					   || starpu_worker_is_blocked_in_parallel(workerids[worker])))
			
 
				-			break;
			
 
				+		if (!ws->per_worker[workerids[worker]].notask)
			
 
				+		{
			
 
				+			if (ws->per_worker[workerids[worker]].busy
			
 
				+						   || starpu_worker_is_blocked_in_parallel(workerids[worker])) {
			
 
				+				ntasks = 1;
			
 
				+				break;
			
 
				+			}
			
 
				+		}
			
 
				 
			
 
				 		worker = (worker + 1) % nworkers;
			
 
				-		if (worker == ws->last_pop_worker)
			
 
				+		if (worker == ws->per_worker[workerid].last_pop_worker)
			
 
				 		{
			
 
				 			/* We got back to the first worker,
			
 
				 			 * don't go in infinite loop */
			
@@ -147,7 +161,7 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	ws->last_pop_worker = (worker + 1) % nworkers;
			
 
				+	ws->per_worker[workerid].last_pop_worker = (worker + 1) % nworkers;
			
 
				 
			
 
				 	worker = workerids[worker];
			
 
				 
			
@@ -327,15 +341,31 @@ static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, i
 
				 	{
			
 
				 		/* found an interesting task, try to pick it! */
			
 
				 		if (_starpu_prio_deque_pop_this_task(&data_source->queue, target, best_task))
			
 
				+		{
			
 
				+			if (!data_source->queue.ntasks)
			
 
				+			{
			
 
				+				STARPU_ASSERT(ws->per_worker[source].notask == 0);
			
 
				+				ws->per_worker[source].notask = 1;
			
 
				+			}
			
 
				 			return best_task;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	/* Didn't find an interesting task, or couldn't run it :( */
			
 
				 	int skipped;
			
 
				+	struct starpu_task *task;
			
 
				+
			
 
				 	if (source != target)
			
 
				-		return _starpu_prio_deque_deque_task_for_worker(&data_source->queue, target, &skipped);
			
 
				+		task = _starpu_prio_deque_deque_task_for_worker(&data_source->queue, target, &skipped);
			
 
				 	else
			
 
				-		return _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
			
 
				+		task = _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
			
 
				+
			
 
				+	if (task && !data_source->queue.ntasks)
			
 
				+	{
			
 
				+		STARPU_ASSERT(ws->per_worker[source].notask == 0);
			
 
				+		ws->per_worker[source].notask = 1;
			
 
				+	}
			
 
				+	return task;
			
 
				 }
			
 
				 
			
 
				 /* Called when popping a task from a queue */
			
@@ -371,10 +401,18 @@ static void locality_pushed_task(struct _starpu_work_stealing_data *ws STARPU_AT
 
				 static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, int source, int target)
			
 
				 {
			
 
				 	int skipped;
			
 
				+	struct starpu_task *task;
			
 
				 	if (source != target)
			
 
				-		return _starpu_prio_deque_deque_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
			
 
				+		task = _starpu_prio_deque_deque_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
			
 
				 	else
			
 
				-		return _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
			
 
				+		task = _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
			
 
				+
			
 
				+	if (task && !ws->per_worker[source].queue.ntasks)
			
 
				+	{
			
 
				+		STARPU_ASSERT(ws->per_worker[source].notask == 0);
			
 
				+		ws->per_worker[source].notask = 1;
			
 
				+	}
			
 
				+	return task;
			
 
				 }
			
 
				 /* Called when popping a task from a queue */
			
 
				 static void locality_popped_task(struct _starpu_work_stealing_data *ws STARPU_ATTRIBUTE_UNUSED, struct starpu_task *task STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED, unsigned sched_ctx_id STARPU_ATTRIBUTE_UNUSED)
			
@@ -530,7 +568,8 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 
				 	struct starpu_task *task = NULL;
			
 
				 	unsigned workerid = starpu_worker_get_id_check();
			
 
				 
			
 
				-	ws->per_worker[workerid].busy = 0;
			
 
				+	if (ws->per_worker[workerid].busy)
			
 
				+		ws->per_worker[workerid].busy = 0;
			
 
				 
			
 
				 #ifdef STARPU_NON_BLOCKING_DRIVERS
			
 
				 	if (STARPU_RUNNING_ON_VALGRIND || !_starpu_prio_deque_is_empty(&ws->per_worker[workerid].queue))
			
@@ -617,7 +656,8 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 
				 		if (!task)
			
 
				 			return NULL;
			
 
				 	}
			
 
				-	ws->per_worker[workerid].busy = !!task;
			
 
				+	if (ws->per_worker[workerid].busy != !!task)
			
 
				+		ws->per_worker[workerid].busy = !!task;
			
 
				 	return task;
			
 
				 }
			
 
				 
			
@@ -648,6 +688,11 @@ int ws_push_task(struct starpu_task *task)
 
				 	record_data_locality(task, workerid);
			
 
				 	STARPU_ASSERT_MSG(ws->per_worker[workerid].running, "workerid=%d, ws=%p\n", workerid, ws);
			
 
				 	_starpu_prio_deque_push_back_task(&ws->per_worker[workerid].queue, task);
			
 
				+	if (ws->per_worker[workerid].queue.ntasks == 1)
			
 
				+	{
			
 
				+		STARPU_ASSERT(ws->per_worker[workerid].notask == 1);
			
 
				+		ws->per_worker[workerid].notask = 0;
			
 
				+	}
			
 
				 	locality_pushed_task(ws, task, workerid, sched_ctx_id);
			
 
				 
			
 
				 	starpu_push_task_end(task);
			
@@ -676,10 +721,12 @@ static void ws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworke
 
				 		int workerid = workerids[i];
			
 
				 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
			
 
				 		_starpu_prio_deque_init(&ws->per_worker[workerid].queue);
			
 
				+		ws->per_worker[workerid].notask = 1;
			
 
				 		ws->per_worker[workerid].running = 1;
			
 
				 
			
 
				 		/* Tell helgrind that we are fine with getting outdated values,
			
 
				 		 * this is just an estimation */
			
 
				+		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].notask);
			
 
				 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].queue.ntasks);
			
 
				 		ws->per_worker[workerid].busy = 0;
			
 
				 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].busy);
			
@@ -708,9 +755,7 @@ static void initialize_ws_policy(unsigned sched_ctx_id)
 
				 	_STARPU_MALLOC(ws, sizeof(struct _starpu_work_stealing_data));
			
 
				 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
			
 
				 
			
 
				-	ws->last_pop_worker = 0;
			
 
				 	ws->last_push_worker = 0;
			
 
				-	STARPU_HG_DISABLE_CHECKING(ws->last_pop_worker);
			
 
				 	STARPU_HG_DISABLE_CHECKING(ws->last_push_worker);
			
 
				 	ws->select_victim = select_victim;
			
 
				 
			
@@ -760,11 +805,12 @@ static int lws_select_victim(struct _starpu_work_stealing_data *ws, unsigned sch
 
				 	for (i = 0; i < nworkers; i++)
			
 
				 	{
			
 
				 		int neighbor = ws->per_worker[workerid].proxlist[i];
			
 
				+		if (ws->per_worker[neighbor].notask)
			
 
				+			continue;
			
 
				                 /* FIXME: do not keep looking again and again at some worker
			
 
				                  * which has tasks, but that can't execute on me */
			
 
				-		int ntasks = ws->per_worker[neighbor].queue.ntasks;
			
 
				-		if (ntasks && (ws->per_worker[neighbor].busy
			
 
				-					   || starpu_worker_is_blocked_in_parallel(neighbor)))
			
 
				+		if (ws->per_worker[neighbor].busy
			
 
				+					   || starpu_worker_is_blocked_in_parallel(neighbor))
			
 
				 			return neighbor;
			
 
				 	}
			
 
				 	return -1;
			
--- a/src/util/execute_on_all.c
+++ b/src/util/execute_on_all.c
@@ -107,8 +107,7 @@ void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t
 
				 	unsigned nworkers = starpu_worker_get_count();
			
 
				 	struct starpu_task *tasks[STARPU_NMAXWORKERS];
			
 
				 
			
 
				-	/* This method only work on CPU, CUDA, OPENCL */
			
 
				-	STARPU_ASSERT((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0);
			
 
				+	STARPU_ASSERT_MSG((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0, "This function is implemented only on CPU, CUDA, OpenCL");
			
 
				 
			
 
				 	/* create a wrapper codelet */
			
 
				 	struct starpu_codelet wrapper_cl =
			
--- a/src/util/starpu_data_cpy.c
+++ b/src/util/starpu_data_cpy.c
@@ -159,6 +159,7 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 
				 	task->callback_func = callback_func;
			
 
				 	task->callback_arg = callback_arg;
			
 
				 
			
 
				+	/* FIXME: priority!! */
			
 
				 	STARPU_TASK_SET_HANDLE(task, dst_handle, 0);
			
 
				 	STARPU_TASK_SET_HANDLE(task, src_handle, 1);
			
 
				 
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -43,6 +43,7 @@ EXTRA_DIST =					\
 
				 	microbenchs/parallel_independent_heterogeneous_tasks.sh	\
			
 
				 	microbenchs/parallel_independent_homogeneous_tasks_data.sh	\
			
 
				 	microbenchs/parallel_independent_homogeneous_tasks.sh	\
			
 
				+	microbenchs/bandwidth_scheds.sh		\
			
 
				 	energy/static.sh			\
			
 
				 	energy/dynamic.sh			\
			
 
				 	energy/perfs.gp				\
			
@@ -73,7 +74,7 @@ EXTRA_DIST =					\
 
				 	model-checking/starpu-mc.sh.in
			
 
				 
			
 
				 CLEANFILES = 					\
			
 
				-	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90
			
 
				+	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90 bandwidth-*.dat bandwidth.gp bandwidth.eps bandwidth.svg
			
 
				 
			
 
				 BUILT_SOURCES =
			
 
				 SUBDIRS =
			
@@ -268,6 +269,7 @@ myPROGRAMS +=				\
 
				 	datawizard/acquire_cb_insert		\
			
 
				 	datawizard/acquire_release		\
			
 
				 	datawizard/acquire_release2		\
			
 
				+	datawizard/acquire_release_to		\
			
 
				 	datawizard/acquire_try			\
			
 
				 	datawizard/bcsr				\
			
 
				 	datawizard/cache			\
			
@@ -355,6 +357,7 @@ myPROGRAMS +=				\
 
				 	microbenchs/prefetch_data_on_node 	\
			
 
				 	microbenchs/redundant_buffer		\
			
 
				 	microbenchs/matrix_as_vector		\
			
 
				+	microbenchs/bandwidth			\
			
 
				 	overlap/gpu_concurrency			\
			
 
				 	parallel_tasks/explicit_combined_worker	\
			
 
				 	parallel_tasks/parallel_kernels		\
			
@@ -431,6 +434,8 @@ examplebin_SCRIPTS = \
 
				 	microbenchs/tasks_size_overhead.sh
			
 
				 if !STARPU_SIMGRID
			
 
				 if !STARPU_USE_MPI_MASTER_SLAVE
			
 
				+examplebin_PROGRAMS += \
			
 
				+	microbenchs/bandwidth
			
 
				 SHELL_TESTS += \
			
 
				 	microbenchs/tasks_data_overhead.sh \
			
 
				 	microbenchs/sync_tasks_data_overhead.sh \
			
@@ -454,6 +459,7 @@ endif
 
				 if !STARPU_USE_MPI_MASTER_SLAVE
			
 
				 SHELL_TESTS += \
			
 
				 	datawizard/locality.sh \
			
 
				+	microbenchs/bandwidth_scheds.sh \
			
 
				 	overlap/overlap.sh
			
 
				 endif
			
 
				 
			
@@ -505,6 +511,17 @@ datawizard_acquire_release2_SOURCES +=		\
 
				 	datawizard/acquire_release_opencl.c
			
 
				 endif
			
 
				 
			
 
				+datawizard_acquire_release_to_SOURCES =		\
			
 
				+	datawizard/acquire_release_to.c
			
 
				+if STARPU_USE_CUDA
			
 
				+datawizard_acquire_release_to_SOURCES +=		\
			
 
				+	datawizard/acquire_release_cuda.cu
			
 
				+endif
			
 
				+if STARPU_USE_OPENCL
			
 
				+datawizard_acquire_release_to_SOURCES +=		\
			
 
				+	datawizard/acquire_release_opencl.c
			
 
				+endif
			
 
				+
			
 
				 datawizard_scratch_SOURCES =			\
			
 
				 	datawizard/scratch.c
			
 
				 if STARPU_USE_CUDA
			
--- a/tests/datawizard/acquire_release_to.c
+++ b/tests/datawizard/acquire_release_to.c
@@ -0,0 +1,214 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+/*
			
 
				+ * Check that _release_to correctly interacts with tasks working on the same data
			
 
				+ */
			
 
				+
			
 
				+#ifdef STARPU_QUICK_CHECK
			
 
				+static unsigned ntasks = 10;
			
 
				+#elif !defined(STARPU_LONG_CHECK)
			
 
				+static unsigned ntasks = 1000;
			
 
				+#else
			
 
				+static unsigned ntasks = 10000;
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+extern void increment_cuda(void *descr[], void *_args);
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+extern void increment_opencl(void *buffers[], void *args);
			
 
				+#endif
			
 
				+
			
 
				+void increment_cpu(void *descr[], void *arg)
			
 
				+{
			
 
				+	(void)arg;
			
 
				+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				+	(*tokenptr)++;
			
 
				+}
			
 
				+
			
 
				+static struct starpu_codelet increment_cl =
			
 
				+{
			
 
				+	.modes = { STARPU_RW },
			
 
				+	.cpu_funcs = {increment_cpu},
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_funcs = {increment_cuda},
			
 
				+	.cuda_flags = {STARPU_CUDA_ASYNC},
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	.opencl_funcs = {increment_opencl},
			
 
				+	.opencl_flags = {STARPU_OPENCL_ASYNC},
			
 
				+#endif
			
 
				+	.cpu_funcs_name = {"increment_cpu"},
			
 
				+	.nbuffers = 1
			
 
				+};
			
 
				+
			
 
				+void check_cpu(void *descr[], void *arg)
			
 
				+{
			
 
				+	unsigned *val = arg;
			
 
				+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				+	STARPU_ASSERT(*tokenptr == *val);
			
 
				+}
			
 
				+
			
 
				+static struct starpu_codelet check_cl =
			
 
				+{
			
 
				+	.modes = { STARPU_R },
			
 
				+	.cpu_funcs = {check_cpu},
			
 
				+	.cpu_funcs_name = {"check_cpu"},
			
 
				+	.nbuffers = 1
			
 
				+};
			
 
				+
			
 
				+unsigned token = 0;
			
 
				+starpu_data_handle_t token_handle;
			
 
				+
			
 
				+static
			
 
				+int increment_token(void)
			
 
				+{
			
 
				+	int ret;
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+	task->cl = &increment_cl;
			
 
				+	task->handles[0] = token_handle;
			
 
				+	ret = starpu_task_submit(task);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static
			
 
				+int check_token(unsigned value)
			
 
				+{
			
 
				+	unsigned *value_p;
			
 
				+	int ret;
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+	task->cl = &check_cl;
			
 
				+	task->handles[0] = token_handle;
			
 
				+	task->cl_arg = value_p = malloc(sizeof(*value_p));
			
 
				+	task->cl_arg_size = sizeof(*value_p);
			
 
				+	task->cl_arg_free = 1;
			
 
				+	*value_p = value;
			
 
				+	ret = starpu_task_submit(task);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static
			
 
				+void callback(void *arg)
			
 
				+{
			
 
				+	(void)arg;
			
 
				+	token++;
			
 
				+	starpu_data_release_to(token_handle, STARPU_W);
			
 
				+	starpu_sleep(0.001);
			
 
				+	starpu_data_release_to(token_handle, STARPU_R);
			
 
				+	starpu_sleep(0.001);
			
 
				+	starpu_data_release(token_handle);
			
 
				+}
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+struct starpu_opencl_program opencl_program;
			
 
				+#endif
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	int ret;
			
 
				+
			
 
				+        ret = starpu_initialize(NULL, &argc, &argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/acquire_release_opencl_kernel.cl",
			
 
				+						  &opencl_program, NULL);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
			
 
				+#endif
			
 
				+	starpu_variable_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, sizeof(unsigned));
			
 
				+
			
 
				+        FPRINTF(stderr, "Token: %u\n", token);
			
 
				+
			
 
				+	for(i=0; i<ntasks; i++)
			
 
				+	{
			
 
				+		/* synchronize data in RAM */
			
 
				+                ret = starpu_data_acquire(token_handle, STARPU_RW);
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
			
 
				+
			
 
				+                token ++;
			
 
				+
			
 
				+		ret = check_token(4*i+1);
			
 
				+		if (ret == -ENODEV) goto enodev_release;
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+		ret = increment_token();
			
 
				+		if (ret == -ENODEV) goto enodev_release;
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+		ret = check_token(4*i+2);
			
 
				+		if (ret == -ENODEV) goto enodev_release;
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+		starpu_sleep(0.001);
			
 
				+		starpu_data_release_to(token_handle, STARPU_W);
			
 
				+
			
 
				+		starpu_sleep(0.001);
			
 
				+		starpu_data_release_to(token_handle, STARPU_R);
			
 
				+
			
 
				+		starpu_sleep(0.001);
			
 
				+		starpu_data_release(token_handle);
			
 
				+
			
 
				+		ret = starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
			
 
				+
			
 
				+		ret = check_token(4*i+3);
			
 
				+		if (ret == -ENODEV) goto enodev;
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+		ret = increment_token();
			
 
				+		if (ret == -ENODEV) goto enodev;
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+		ret = check_token(4*i+4);
			
 
				+		if (ret == -ENODEV) goto enodev;
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+	}
			
 
				+
			
 
				+	starpu_data_unregister(token_handle);
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        ret = starpu_opencl_unload_opencl(&opencl_program);
			
 
				+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
			
 
				+#endif
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+        FPRINTF(stderr, "Token: %u\n", token);
			
 
				+	if (token == ntasks * 4)
			
 
				+		ret = EXIT_SUCCESS;
			
 
				+	else
			
 
				+		ret = EXIT_FAILURE;
			
 
				+	return ret;
			
 
				+
			
 
				+enodev_release:
			
 
				+	starpu_data_release(token_handle);
			
 
				+enodev:
			
 
				+	starpu_data_unregister(token_handle);
			
 
				+	fprintf(stderr, "WARNING: No one can execute this task\n");
			
 
				+	/* yes, we do not perform the computation but we did detect that no one
			
 
				+ 	 * could perform the kernel, so this is not an error from StarPU */
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        ret = starpu_opencl_unload_opencl(&opencl_program);
			
 
				+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
			
 
				+#endif
			
 
				+	starpu_shutdown();
			
 
				+	return STARPU_TEST_SKIPPED;
			
 
				+}
			
--- a/tests/datawizard/bcsr.c
+++ b/tests/datawizard/bcsr.c
@@ -39,20 +39,20 @@ void cpu_show_bcsr(void *descr[], void *arg)
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 
			
 
				-	printf("\nnnz %d elemsize %d\n", nnz, elemsize);
			
 
				+	printf("\nnnz %u elemsize %u\n", nnz, elemsize);
			
 
				 
			
 
				 	for (i = 0; i < nrow; i++)
			
 
				 	{
			
 
				 		uint32_t row_start = rowptr[i] - firstentry;
			
 
				 		uint32_t row_end = rowptr[i+1] - firstentry;
			
 
				 
			
 
				-		printf("row %d\n", i);
			
 
				+		printf("row %u\n", i);
			
 
				 
			
 
				 		for (j = row_start; j < row_end; j++)
			
 
				 		{
			
 
				 			int *block = nzval + j * r*c;
			
 
				 
			
 
				-			printf( " column %d\n", colind[j]);
			
 
				+			printf( " column %u\n", colind[j]);
			
 
				 
			
 
				 			for (y = 0; y < r; y++)
			
 
				 			{
			
--- a/tests/datawizard/interfaces/block/block_opencl.c
+++ b/tests/datawizard/interfaces/block/block_opencl.c
@@ -83,12 +83,12 @@ test_block_opencl_func(void *buffers[], void *args)
 
				 	}
			
 
				 			
			
 
				 	{
			
 
				-		size_t global = nx * ny * nz;
			
 
				+		size_t global[3] = {nx, ny, nz};
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					     kernel,
			
 
				-					     1,
			
 
				+					     3,
			
 
				 					     NULL,
			
 
				-					     &global,
			
 
				+					     global,
			
 
				 					     NULL,
			
 
				 					     0,
			
 
				 					     NULL,
			
--- a/tests/datawizard/interfaces/block/block_opencl_kernel.cl
+++ b/tests/datawizard/interfaces/block/block_opencl_kernel.cl
@@ -18,29 +18,21 @@ __kernel void block_opencl(__global int *block,
 
				 			   int ldy, int ldz,
			
 
				 			   int factor, __global int *err)
			
 
				 {
			
 
				-        const int id = get_global_id(0);
			
 
				-	if (id > 0)
			
 
				+	const int idx = get_global_id(0);
			
 
				+	const int idy = get_global_id(1);
			
 
				+	const int idz = get_global_id(2);
			
 
				+	if (idx >= nx)
			
 
				 		return;
			
 
				+	if (idy >= ny)
			
 
				+		return;
			
 
				+	if (idz >= nz)
			
 
				+		return;
			
 
				+
			
 
				+	int val = idz*ny*nx+idy*nx+idx;
			
 
				+	int i = (idz*ldz)+(idy*ldy)+idx;
			
 
				 
			
 
				-	unsigned int i, j, k;
			
 
				-	int val = 0;
			
 
				-	for (k = 0; k < nz; k++)
			
 
				-	{
			
 
				-		for (j = 0; j < ny; j++)
			
 
				-		{
			
 
				-			for (i = 0; i < nx; i++)
			
 
				-			{
			
 
				-                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
			
 
				-				{
			
 
				-					*err = 1;
			
 
				-					return;
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					block[(k*ldz)+(j*ldy)+i] *= -1;
			
 
				-					val++;
			
 
				-				}
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				+	if (block[i] != factor * val)
			
 
				+		*err = 1;
			
 
				+	else
			
 
				+		block[i] *= -1;
			
 
				 }
			
--- a/tests/datawizard/interfaces/tensor/tensor_interface.c
+++ b/tests/datawizard/interfaces/tensor/tensor_interface.c
@@ -18,7 +18,7 @@
 
				 #include "../test_interfaces.h"
			
 
				 #include "../../../helper.h"
			
 
				 
			
 
				-#define NX 16
			
 
				+#define NX 4
			
 
				 #define NY NX
			
 
				 #define NZ NX
			
 
				 #define NT NX
			
--- a/tests/datawizard/interfaces/tensor/tensor_opencl.c
+++ b/tests/datawizard/interfaces/tensor/tensor_opencl.c
@@ -87,12 +87,12 @@ test_tensor_opencl_func(void *buffers[], void *args)
 
				 	}
			
 
				 			
			
 
				 	{
			
 
				-                size_t global = 1;
			
 
				+		size_t global[3] = {nx, ny, nz*nt};
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					     kernel,
			
 
				-					     1,
			
 
				+					     3,
			
 
				 					     NULL,
			
 
				-					     &global,
			
 
				+					     global,
			
 
				 					     NULL,
			
 
				 					     0,
			
 
				 					     NULL,
			
--- a/tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl
+++ b/tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl
@@ -18,32 +18,24 @@ __kernel void tensor_opencl(__global int *tensor,
 
				 			   int ldy, int ldz, int ldt,
			
 
				 			   int factor, __global int *err)
			
 
				 {
			
 
				-        const int id = get_global_id(0);
			
 
				-	if (id > 0)
			
 
				+	const int idx = get_global_id(0);
			
 
				+	const int idy = get_global_id(1);
			
 
				+	const int idz = get_global_id(2) % nz;
			
 
				+	const int idt = get_global_id(2) / nz;
			
 
				+	if (idx >= nx)
			
 
				 		return;
			
 
				+	if (idy >= ny)
			
 
				+		return;
			
 
				+	if (idz >= nz)
			
 
				+		return;
			
 
				+	if (idt >= nt)
			
 
				+		return;
			
 
				+
			
 
				+	int val = idt*nz*ny*nx+idz*ny*nx+idy*nx+idx;
			
 
				+	int i = (idt*ldt)+(idz*ldz)+(idy*ldy)+idx;
			
 
				 
			
 
				-	unsigned int i, j, k, l;
			
 
				-	int val = 0;
			
 
				-	for (l = 0; l < nt; l++)
			
 
				-	{
			
 
				-	    for (k = 0; k < nz; k++)
			
 
				-	    {
			
 
				-		for (j = 0; j < ny; j++)
			
 
				-		{
			
 
				-			for (i = 0; i < nx; i++)
			
 
				-			{
			
 
				-                                if (tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] != factor * val)
			
 
				-				{
			
 
				-					*err = 1;
			
 
				-					return;
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] *= -1;
			
 
				-					val++;
			
 
				-				}
			
 
				-			}
			
 
				-		}
			
 
				-	    }
			
 
				-	}
			
 
				+	if (tensor[i] != factor * val)
			
 
				+		*err = 1;
			
 
				+	else
			
 
				+		tensor[i] *= -1;
			
 
				 }
			
--- a/tests/energy/energy_efficiency.c
+++ b/tests/energy/energy_efficiency.c
@@ -320,11 +320,19 @@ int main(int argc, char *argv[])
 
				 
			
 
				 	unsigned N, k, m, n, iter, NITER;
			
 
				 	if (argc < 2)
			
 
				+#ifdef STARPU_QUICK_CHECK
			
 
				+		N = 10;
			
 
				+#else
			
 
				 		N = 40;
			
 
				+#endif
			
 
				 	else
			
 
				 		N = atoi(argv[1]);
			
 
				 	if (argc < 3)
			
 
				+#ifdef STARPU_QUICK_CHECK
			
 
				+		NITER = 3;
			
 
				+#else
			
 
				 		NITER = 10;
			
 
				+#endif
			
 
				 	else
			
 
				 		NITER = atoi(argv[2]);
			
 
				 	if (N == 0)
			
--- a/tests/helper/starpu_data_dup_ro.c
+++ b/tests/helper/starpu_data_dup_ro.c
@@ -84,7 +84,7 @@ int main(int argc, char **argv)
 
				 	ret = EXIT_SUCCESS;
			
 
				 	if (*var != 42)
			
 
				 	{
			
 
				-	     FPRINTF(stderr, "var2 is %d but it should be %d\n", *var, 42);
			
 
				+	     FPRINTF(stderr, "var2 is %u but it should be %d\n", *var, 42);
			
 
				 	     ret = EXIT_FAILURE;
			
 
				 	}
			
 
				 	starpu_data_release(var2_handle);
			
@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 
				 	var = starpu_data_get_local_ptr(var3_handle);
			
 
				 	if (*var != 42)
			
 
				 	{
			
 
				-	     FPRINTF(stderr, "var3 is %d but it should be %d\n", *var, 42);
			
 
				+	     FPRINTF(stderr, "var3 is %u but it should be %d\n", *var, 42);
			
 
				 	     ret = EXIT_FAILURE;
			
 
				 	}
			
 
				 	starpu_data_release(var3_handle);
			
@@ -102,7 +102,7 @@ int main(int argc, char **argv)
 
				 	var = starpu_data_get_local_ptr(var4_handle);
			
 
				 	if (*var != 42)
			
 
				 	{
			
 
				-	     FPRINTF(stderr, "var4 is %d but it should be %d\n", *var, 42);
			
 
				+	     FPRINTF(stderr, "var4 is %u but it should be %d\n", *var, 42);
			
 
				 	     ret = EXIT_FAILURE;
			
 
				 	}
			
 
				 	starpu_data_release(var4_handle);
			
@@ -111,7 +111,7 @@ int main(int argc, char **argv)
 
				 	var = starpu_data_get_local_ptr(var5_handle);
			
 
				 	if (*var != 43)
			
 
				 	{
			
 
				-	     FPRINTF(stderr, "var5 is %d but it should be %d\n", *var, 43);
			
 
				+	     FPRINTF(stderr, "var5 is %u but it should be %d\n", *var, 43);
			
 
				 	     ret = EXIT_FAILURE;
			
 
				 	}
			
 
				 	starpu_data_release(var5_handle);
			
--- a/tests/microbenchs/bandwidth.c
+++ b/tests/microbenchs/bandwidth.c
@@ -0,0 +1,343 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+#include <unistd.h>
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+/*
			
 
				+ * Measure the memory bandwidth available to kernels depending on the number of
			
 
				+ * kernels and number of idle workers.
			
 
				+ */
			
 
				+
			
 
				+#ifdef STARPU_QUICK_CHECK
			
 
				+static size_t size = 1024;
			
 
				+static unsigned cpustep = 4;
			
 
				+#else
			
 
				+/* Must be bigger than available cache size per core, 64MiB should be enough */
			
 
				+static size_t size = 64UL << 20;
			
 
				+static unsigned cpustep = 1;
			
 
				+#endif
			
 
				+
			
 
				+static unsigned noalone = 0;
			
 
				+static unsigned iter = 30;
			
 
				+static unsigned total_ncpus;
			
 
				+static starpu_pthread_barrier_t barrier_begin, barrier_end;
			
 
				+static float *result;
			
 
				+static void **buffers;	/* Indexed by logical core number */
			
 
				+static char padding1[STARPU_CACHELINE_SIZE];
			
 
				+static volatile char finished;
			
 
				+static char padding2[STARPU_CACHELINE_SIZE];
			
 
				+
			
 
				+static unsigned interleave(unsigned i);
			
 
				+
			
 
				+/* Initialize the buffer locally */
			
 
				+void initialize_buffer(void *foo)
			
 
				+{
			
 
				+	unsigned id = starpu_worker_get_id();
			
 
				+#ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				+	int ret = posix_memalign(&buffers[id], getpagesize(), 2*size);
			
 
				+	STARPU_ASSERT(ret == 0);
			
 
				+#else
			
 
				+	buffers[id] = malloc(2*size);
			
 
				+#endif
			
 
				+	memset(buffers[id], 0, 2*size);
			
 
				+}
			
 
				+
			
 
				+/* Actual transfer codelet */
			
 
				+void bw_func(void *descr[], void *arg)
			
 
				+{
			
 
				+	int id = (uintptr_t) arg;
			
 
				+	void *src = buffers[id];
			
 
				+	void *dst = (void*) ((uintptr_t)src + size);
			
 
				+	unsigned i;
			
 
				+	double start, stop;
			
 
				+
			
 
				+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
			
 
				+	start = starpu_timing_now();
			
 
				+	for (i = 0; i < iter; i++)
			
 
				+	{
			
 
				+		memcpy(dst, src, size);
			
 
				+		STARPU_SYNCHRONIZE();
			
 
				+	}
			
 
				+	stop = starpu_timing_now();
			
 
				+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_end);
			
 
				+	finished = 1;
			
 
				+
			
 
				+	result[id] = (size*iter) / (stop - start);
			
 
				+}
			
 
				+
			
 
				+static struct starpu_codelet bw_codelet =
			
 
				+{
			
 
				+	.cpu_funcs = {bw_func},
			
 
				+	.model = NULL,
			
 
				+	.nbuffers = 0,
			
 
				+};
			
 
				+
			
 
				+/* Codelet that waits for completion while doing lots of cpu yields (nop). */
			
 
				+void nop_func(void *descr[], void *arg)
			
 
				+{
			
 
				+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
			
 
				+	while (!finished)
			
 
				+	{
			
 
				+		unsigned i;
			
 
				+		for (i = 0; i < 1000000; i++)
			
 
				+			STARPU_UYIELD();
			
 
				+		STARPU_SYNCHRONIZE();
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static struct starpu_codelet nop_codelet =
			
 
				+{
			
 
				+	.cpu_funcs = {nop_func},
			
 
				+	.model = NULL,
			
 
				+	.nbuffers = 0,
			
 
				+};
			
 
				+
			
 
				+/* Codelet that waits for completion while aggressively reading the finished variable. */
			
 
				+void sync_func(void *descr[], void *arg)
			
 
				+{
			
 
				+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
			
 
				+	while (!finished)
			
 
				+	{
			
 
				+		STARPU_VALGRIND_YIELD();
			
 
				+		STARPU_SYNCHRONIZE();
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static struct starpu_codelet sync_codelet =
			
 
				+{
			
 
				+	.cpu_funcs = {sync_func},
			
 
				+	.model = NULL,
			
 
				+	.nbuffers = 0,
			
 
				+};
			
 
				+
			
 
				+static void usage(char **argv)
			
 
				+{
			
 
				+	fprintf(stderr, "Usage: %s [-n niter] [-s size (MB)] [-c cpustep] [-a]\n", argv[0]);
			
 
				+	fprintf(stderr, "\t-n niter\tNumber of iterations\n");
			
 
				+	fprintf(stderr, "\t-s size\tBuffer size in MB\n");
			
 
				+	fprintf(stderr, "\t-c cpustep\tCpu number increment\n");
			
 
				+	fprintf(stderr, "\t-a Do not run the alone test\n");
			
 
				+	exit(EXIT_FAILURE);
			
 
				+}
			
 
				+
			
 
				+static void parse_args(int argc, char **argv)
			
 
				+{
			
 
				+	int c;
			
 
				+	while ((c = getopt(argc, argv, "n:s:c:ah")) != -1)
			
 
				+	switch(c)
			
 
				+	{
			
 
				+		case 'n':
			
 
				+			iter = atoi(optarg);
			
 
				+			break;
			
 
				+		case 's':
			
 
				+			size = (long)atoi(optarg) << 20;
			
 
				+			break;
			
 
				+		case 'c':
			
 
				+			cpustep = atoi(optarg);
			
 
				+			break;
			
 
				+		case 'a':
			
 
				+			noalone = 1;
			
 
				+			break;
			
 
				+		case 'h':
			
 
				+			usage(argv);
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static unsigned interleave(unsigned i)
			
 
				+{
			
 
				+	/* TODO: rather distribute over hierarchy */
			
 
				+	if (total_ncpus > 1)
			
 
				+		return (i % (total_ncpus/2))*2 + i / (total_ncpus/2);
			
 
				+	else
			
 
				+		return 0;
			
 
				+}
			
 
				+
			
 
				+enum sleep_type {
			
 
				+	PAUSE,
			
 
				+	NOP,
			
 
				+	SYNC,
			
 
				+	SCHED,
			
 
				+};
			
 
				+
			
 
				+static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int intl, enum sleep_type sleep)
			
 
				+{
			
 
				+	int ret;
			
 
				+	unsigned i;
			
 
				+	struct starpu_conf conf;
			
 
				+	float bw;
			
 
				+
			
 
				+	starpu_conf_init(&conf);
			
 
				+	conf.precedence_over_environment_variables = 1;
			
 
				+	conf.ncuda = 0;
			
 
				+	conf.nopencl = 0;
			
 
				+	conf.nmic = 0;
			
 
				+	conf.nmpi_ms = 0;
			
 
				+	conf.ncpus = ncpus;
			
 
				+
			
 
				+	if (intl && sleep == PAUSE)
			
 
				+	{
			
 
				+		conf.use_explicit_workers_bindid = 1;
			
 
				+		for (i = 0; i < ncpus; i++)
			
 
				+			conf.workers_bindid[i] = interleave(i);
			
 
				+	}
			
 
				+
			
 
				+	ret = starpu_initialize(&conf, argc, argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	if (sleep == PAUSE || sleep == SCHED)
			
 
				+		/* In these cases we don't have a task on each cpu */
			
 
				+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, nbusy);
			
 
				+	else
			
 
				+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, ncpus);
			
 
				+
			
 
				+	STARPU_PTHREAD_BARRIER_INIT(&barrier_end, NULL, nbusy);
			
 
				+
			
 
				+	finished = 0;
			
 
				+	for (i = 0; i < ncpus; i++)
			
 
				+		result[i] = NAN;
			
 
				+
			
 
				+	for (i = 0; i < nbusy; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+		task->cl = &bw_codelet;
			
 
				+
			
 
				+		if (intl)
			
 
				+			task->cl_arg = (void*) (uintptr_t) interleave(i);
			
 
				+		else
			
 
				+			task->cl_arg = (void*) (uintptr_t) i;
			
 
				+
			
 
				+		task->execute_on_a_specific_worker = 1;
			
 
				+		if (intl && sleep != PAUSE) /* In the pause case we interleaved above */
			
 
				+			task->workerid = interleave(i);
			
 
				+		else
			
 
				+			task->workerid = i;
			
 
				+
			
 
				+		ret = starpu_task_submit(task);
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	}
			
 
				+
			
 
				+	if (sleep != PAUSE && sleep != SCHED)
			
 
				+	{
			
 
				+		/* Add waiting tasks */
			
 
				+		for ( ; i < ncpus; i++)
			
 
				+		{
			
 
				+			struct starpu_task *task = starpu_task_create();
			
 
				+			switch (sleep)
			
 
				+			{
			
 
				+			case NOP:
			
 
				+				task->cl = &nop_codelet;
			
 
				+				break;
			
 
				+			case SYNC:
			
 
				+				task->cl = &sync_codelet;
			
 
				+				break;
			
 
				+			default:
			
 
				+				STARPU_ASSERT(0);
			
 
				+			}
			
 
				+			task->execute_on_a_specific_worker = 1;
			
 
				+			task->workerid = interleave(i);
			
 
				+			ret = starpu_task_submit(task);
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	for (bw = 0., i = 0; i < nbusy; i++)
			
 
				+	{
			
 
				+		if (intl)
			
 
				+			bw += result[interleave(i)];
			
 
				+		else
			
 
				+			bw += result[i];
			
 
				+	}
			
 
				+	return bw;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int ret;
			
 
				+	unsigned n;
			
 
				+	struct starpu_conf conf;
			
 
				+	float alone, alone_int, alone_int_nop, alone_int_sync, sched, sched_int;
			
 
				+
			
 
				+	parse_args(argc, argv);
			
 
				+
			
 
				+	starpu_conf_init(&conf);
			
 
				+	conf.precedence_over_environment_variables = 1;
			
 
				+	conf.ncuda = 0;
			
 
				+	conf.nopencl = 0;
			
 
				+	conf.nmic = 0;
			
 
				+	conf.nmpi_ms = 0;
			
 
				+
			
 
				+	ret = starpu_initialize(&conf, &argc, &argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+	total_ncpus = starpu_cpu_worker_get_count();
			
 
				+
			
 
				+	buffers = malloc(total_ncpus * sizeof(*buffers));
			
 
				+	starpu_execute_on_each_worker_ex(initialize_buffer, NULL, STARPU_CPU, "init_buffer");
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	if (total_ncpus == 0)
			
 
				+		return STARPU_TEST_SKIPPED;
			
 
				+
			
 
				+	result = malloc(total_ncpus * sizeof(result[0]));
			
 
				+
			
 
				+	printf("# nw\ta comp.\t+sched\teff%%\ta scat.\t+nop\t+sync\t+sched\teff%% vs nop\n");
			
 
				+	for (n = cpustep; n <= total_ncpus; n += cpustep)
			
 
				+	{
			
 
				+		if (noalone)
			
 
				+		{
			
 
				+			alone = 0.;
			
 
				+			alone_int = 0.;
			
 
				+			alone_int_nop = 0.;
			
 
				+			alone_int_sync = 0.;
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			alone = bench(&argc, &argv, n, n, 0, PAUSE);
			
 
				+			alone_int = bench(&argc, &argv, n, n, 1, PAUSE);
			
 
				+			alone_int_nop = bench(&argc, &argv, n, total_ncpus, 1, NOP);
			
 
				+			alone_int_sync = bench(&argc, &argv, n, total_ncpus, 1, SYNC);
			
 
				+		}
			
 
				+		sched = bench(&argc, &argv, n, total_ncpus, 0, SCHED);
			
 
				+		sched_int = bench(&argc, &argv, n, total_ncpus, 1, SCHED);
			
 
				+		printf("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
			
 
				+				n,
			
 
				+				alone/1000,
			
 
				+				sched/1000, sched*100/alone,
			
 
				+				alone_int/1000,
			
 
				+				alone_int_nop/1000,
			
 
				+				alone_int_sync/1000,
			
 
				+				sched_int/1000, sched_int*100/alone_int_nop);
			
 
				+		fflush(stdout);
			
 
				+	}
			
 
				+
			
 
				+	free(result);
			
 
				+
			
 
				+	for (n = 0; n < total_ncpus; n++)
			
 
				+		free(buffers[n]);
			
 
				+	free(buffers);
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
--- a/tests/microbenchs/bandwidth_scheds.sh
+++ b/tests/microbenchs/bandwidth_scheds.sh
@@ -0,0 +1,75 @@
 
				+#!/bin/bash
			
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2016-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+#
			
 
				+
			
 
				+set -e
			
 
				+
			
 
				+if [ -n "$STARPU_SCHED" ]
			
 
				+then
			
 
				+	SCHEDS=$STARPU_SCHED
			
 
				+	DEFAULT=$STARPU_SCHED
			
 
				+else
			
 
				+	SCHEDS=`$(dirname $0)/../../tools/starpu_sched_display`
			
 
				+	DEFAULT=eager
			
 
				+fi
			
 
				+
			
 
				+if [ -n "$STARPU_BENCH_DIR" ]; then
			
 
				+	cat > bandwidth.gp << EOF
			
 
				+set term svg font ",12" size 1500,500 linewidth 0.5
			
 
				+set output "bandwidth.svg"
			
 
				+set pointsize 0.3
			
 
				+EOF
			
 
				+else
			
 
				+	fast="-n 3 -c 4"
			
 
				+	cat > bandwidth.gp << EOF
			
 
				+set term postscript eps enhanced color font ",18"
			
 
				+set output "bandwidth.eps"
			
 
				+set size 2,1
			
 
				+EOF
			
 
				+fi
			
 
				+
			
 
				+cat >> bandwidth.gp << EOF
			
 
				+set key outside
			
 
				+set ylabel "GB/s"
			
 
				+set xlabel "ncores"
			
 
				+
			
 
				+plot \\
			
 
				+	"bandwidth-$DEFAULT.dat" using 1:5 with lines title "alone interleave", \\
			
 
				+	"bandwidth-$DEFAULT.dat" using 1:6 with lines title "nop", \\
			
 
				+	"bandwidth-$DEFAULT.dat" using 1:7 with lines title "sync", \\
			
 
				+	"bandwidth-$DEFAULT.dat" using 1:2 with lines title "alone contiguous", \\
			
 
				+EOF
			
 
				+
			
 
				+type=1
			
 
				+for sched in $SCHEDS
			
 
				+do
			
 
				+	if [ "$sched" != eager -a "$sched" != "$SCHEDS" ]; then
			
 
				+		extra=-a
			
 
				+	else
			
 
				+		extra=
			
 
				+	fi
			
 
				+
			
 
				+	STARPU_BACKOFF_MIN=0 STARPU_BACKOFF_MAX=0 STARPU_SCHED=$sched $STARPU_LAUNCH $(dirname $0)/bandwidth $fast $extra "$@" | tee bandwidth-$sched.dat
			
 
				+	echo "\"bandwidth-$sched.dat\" using 1:3 with linespoints lt $type pt $type title \"$sched\", \\" >> bandwidth.gp
			
 
				+	echo "\"bandwidth-$sched.dat\" using 1:8 with linespoints lt $type pt $type notitle, \\" >> bandwidth.gp
			
 
				+	type=$((type+1))
			
 
				+done
			
 
				+
			
 
				+if gnuplot bandwidth.gp ; then
			
 
				+	if [ -n "$STARPU_BENCH_DIR" ]; then
			
 
				+		cp bandwidth.svg $STARPU_BENCH_DIR/
			
 
				+	fi
			
 
				+fi
			
--- a/tests/overlap/overlap.c
+++ b/tests/overlap/overlap.c
@@ -135,6 +135,7 @@ int main(int argc, char **argv)
 
				 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 	}
			
 
				 
			
 
				+	starpu_do_schedule();
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 	if (!finished)
			
 
				 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
			
--- a/tests/parallel_tasks/explicit_combined_worker.c
+++ b/tests/parallel_tasks/explicit_combined_worker.c
@@ -114,7 +114,7 @@ int main(void)
 
				 enodev:
			
 
				 	starpu_data_unregister(v_handle);
			
 
				 	starpu_free(v);
			
 
				-	fprintf(stderr, "WARNING: No one can execute the task on workerid %d\n", worker);
			
 
				+	fprintf(stderr, "WARNING: No one can execute the task on workerid %u\n", worker);
			
 
				 	/* yes, we do not perform the computation but we did detect that no one
			
 
				  	 * could perform the kernel, so this is not an error from StarPU */
			
 
				 	starpu_shutdown();
			
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -285,6 +285,7 @@ EXTRA_DIST =				\
 
				 	dev/cppcheck/suppressions.txt	\
			
 
				 	dev/valgrind/bash.suppr		\
			
 
				 	dev/valgrind/fxt.suppr		\
			
 
				+	dev/valgrind/glpk.suppr		\
			
 
				 	dev/valgrind/hdf5.suppr		\
			
 
				 	dev/valgrind/hwloc.suppr	\
			
 
				 	dev/valgrind/libc.suppr		\
			
--- a/tools/dev/valgrind/glpk.suppr
+++ b/tools/dev/valgrind/glpk.suppr
@@ -0,0 +1,23 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+#
			
 
				+
			
 
				+{
			
 
				+   <insert_a_suppression_name_here>
			
 
				+   Memcheck:Leak
			
 
				+   match-leak-kinds: reachable
			
 
				+   ...
			
 
				+   fun:glp_init_env
			
 
				+}
			
--- a/tools/dev/valgrind/libc.suppr
+++ b/tools/dev/valgrind/libc.suppr
@@ -263,8 +263,7 @@
 
				    Memcheck:Leak
			
 
				    match-leak-kinds: reachable
			
 
				    fun:malloc
			
 
				-   fun:_dl_close_worker
			
 
				-   fun:_dl_close_worker
			
 
				+   ...
			
 
				    fun:_dl_close
			
 
				    fun:_dl_catch_exception
			
 
				    fun:_dl_catch_error
			
--- a/tools/dev/valgrind/padico.suppr
+++ b/tools/dev/valgrind/padico.suppr
@@ -110,7 +110,7 @@
 
				    Memcheck:Leak
			
 
				    match-leak-kinds: reachable
			
 
				    fun:malloc
			
 
				-   fun:_dl_close_worker
			
 
				+   ...
			
 
				    fun:_dl_close
			
 
				    fun:_dl_catch_error
			
 
				    fun:dlerror_run
			
--- a/tools/gdbinit
+++ b/tools/gdbinit
@@ -39,33 +39,6 @@ define starpu-print-task
 
				   set $task = (struct starpu_task *)$arg0
			
 
				   set $job = (struct _starpu_job *)$task->starpu_private
			
 
				   set $status=0
			
 
				-  if $task->status == 0
			
 
				-    set $status="STARPU_TASK_INIT"
			
 
				-  end
			
 
				-  if $task->status == 1
			
 
				-    set $status="STARPU_TASK_BLOCKED"
			
 
				-  end
			
 
				-  if $task->status == 2
			
 
				-    set $status="STARPU_TASK_READY"
			
 
				-  end
			
 
				-  if $task->status == 3
			
 
				-    set $status="STARPU_TASK_RUNNING"
			
 
				-  end
			
 
				-  if $task->status == 4
			
 
				-    set $status="STARPU_TASK_FINISHED"
			
 
				-  end
			
 
				-  if $task->status == 5
			
 
				-    set $status="STARPU_TASK_BLOCKED_ON_TAG"
			
 
				-  end
			
 
				-  if $task->status == 6
			
 
				-    set $status="STARPU_TASK_BLOCKED_ON_TASK"
			
 
				-  end
			
 
				-  if $task->status == 7
			
 
				-    set $status="STARPU_TASK_BLOCKED_ON_DATA"
			
 
				-  end
			
 
				-  if $task->status == 8
			
 
				-    set $status="STARPU_TASK_STOPPED"
			
 
				-  end
			
 
				 
			
 
				   printf "StarPU Task (%p)\n", $task
			
 
				   if $task->name
			
@@ -81,13 +54,42 @@ define starpu-print-task
 
				   end
			
 
				   printf "\tnbuffers:\t\t\t<%d>\n", $nbuffers
			
 
				   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
			
 
				+  printf "\tcl_arg:\t\t\t<%p>\n", $task->cl_arg
			
 
				   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
			
 
				   printf "\texecute_on_a_specific_worker:\t<%d>\n", $task->execute_on_a_specific_worker
			
 
				   printf "\tworkerid:\t\t\t<%d>\n", $task->workerid
			
 
				   printf "\tdetach:\t\t\t\t<%d>\n", $task->detach
			
 
				   printf "\tdestroy:\t\t\t<%d>\n", $task->destroy
			
 
				   printf "\tregenerate:\t\t\t<%d>\n", $task->regenerate
			
 
				-  printf "\tstatus:\t\t\t\t<%s>\n", $status
			
 
				+  printf "\tstatus:\t\t\t\t"
			
 
				+  if $task->status == 0
			
 
				+    printf "STARPU_TASK_INIT"
			
 
				+  end
			
 
				+  if $task->status == 1
			
 
				+    printf "STARPU_TASK_BLOCKED"
			
 
				+  end
			
 
				+  if $task->status == 2
			
 
				+    printf "STARPU_TASK_READY"
			
 
				+  end
			
 
				+  if $task->status == 3
			
 
				+    printf "STARPU_TASK_RUNNING"
			
 
				+  end
			
 
				+  if $task->status == 4
			
 
				+    printf "STARPU_TASK_FINISHED"
			
 
				+  end
			
 
				+  if $task->status == 5
			
 
				+    printf "STARPU_TASK_BLOCKED_ON_TAG"
			
 
				+  end
			
 
				+  if $task->status == 6
			
 
				+    printf "STARPU_TASK_BLOCKED_ON_TASK"
			
 
				+  end
			
 
				+  if $task->status == 7
			
 
				+    printf "STARPU_TASK_BLOCKED_ON_DATA"
			
 
				+  end
			
 
				+  if $task->status == 8
			
 
				+    printf "STARPU_TASK_STOPPED"
			
 
				+  end
			
 
				+  printf "\n"
			
 
				   printf "\tjob:\t\t\t\t<%p>\n", $job
			
 
				   printf "\ttag_id:\t\t\t\t<%d>\n", $task->tag_id
			
 
				   printf "\tndeps:\t\t\t\t<%u>\n", $job->job_successors->ndeps
			
@@ -169,35 +171,36 @@ define starpu-workers
 
				   printf "[Id] Name                                     Arch Mask Devid Bindid Workerid Isrunning Isinitialized Status\n"
			
 
				   while $num<_starpu_config->topology->nworkers
			
 
				     set $worker=&_starpu_config->workers[$num]
			
 
				+    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d ", $num, $worker->name, $worker->arch, $worker->worker_mask, \
			
 
				+          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized
			
 
				     if $worker->status == STATUS_INVALID
			
 
				-      set $status="INVALID"
			
 
				+      printf "INVALID"
			
 
				     end
			
 
				     if $worker->status == STATUS_UNKNOWN
			
 
				-      set $status="UNKNOWN"
			
 
				+      printf "UNKNOWN"
			
 
				     end
			
 
				     if $worker->status == STATUS_INITIALIZING
			
 
				-      set $status="INITIALIZING"
			
 
				+      printf "INITIALIZING"
			
 
				     end
			
 
				     if $worker->status == STATUS_EXECUTING
			
 
				-      set $status="EXECUTING"
			
 
				+      printf "EXECUTING"
			
 
				     end
			
 
				     if $worker->status == STATUS_CALLBACK
			
 
				-      set $status="CALLBACK"
			
 
				+      printf "CALLBACK"
			
 
				     end
			
 
				     if $worker->status == STATUS_SCHEDULING
			
 
				-      set $status="SCHEDULING"
			
 
				+      printf "SCHEDULING"
			
 
				     end
			
 
				     if $worker->status == STATUS_WAITING
			
 
				-      set $status="WAITING"
			
 
				+      printf "WAITING"
			
 
				     end
			
 
				     if $worker->status == STATUS_SLEEPING_SCHEDULING
			
 
				-      set $status="SLEEPING_SCHEDULING"
			
 
				+      printf "SLEEPING_SCHEDULING"
			
 
				     end
			
 
				     if $worker->status == STATUS_SLEEPING
			
 
				-      set $status="SLEEPING"
			
 
				+      printf "SLEEPING"
			
 
				     end
			
 
				-    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d %s\n", $num, $worker->name, $worker->arch, $worker->worker_mask, \
			
 
				-          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized, $status
			
 
				+    printf "\n"
			
 
				     set $num = $num + 1
			
 
				   end
			
 
				 end
			
@@ -205,23 +208,24 @@ end
 
				 define starpu-print-tag
			
 
				   set language c
			
 
				   set $tag_struct = (struct _starpu_tag *)_gettag_struct($arg0)
			
 
				+  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
			
 
				+  printf "\tstate "
			
 
				   if $tag_struct->state == STARPU_INVALID_STATE
			
 
				-     set $status="STARPU_INVALID_STATE"
			
 
				+     printf "STARPU_INVALID_STATE"
			
 
				   end
			
 
				   if $tag_struct->state == STARPU_ASSOCIATED
			
 
				-     set $status="STARPU_ASSOCIATED"
			
 
				+     printf "STARPU_ASSOCIATED"
			
 
				   end
			
 
				   if $tag_struct->state == STARPU_BLOCKED
			
 
				-     set $status="STARPU_BLOCKED"
			
 
				+     printf "STARPU_BLOCKED"
			
 
				   end
			
 
				   if $tag_struct->state == STARPU_READY
			
 
				-     set $status="STARPU_READY"
			
 
				+     printf "STARPU_READY"
			
 
				   end
			
 
				   if $tag_struct->state == STARPU_DONE
			
 
				-     set $status="STARPU_DONE"
			
 
				+     printf "STARPU_DONE"
			
 
				   end
			
 
				-  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
			
 
				-  printf "\tstate %s\n", $status
			
 
				+  printf "\n"
			
 
				   printf "\tdeps %d\n", $tag_struct->tag_successors.ndeps
			
 
				   printf "\tcompleted %d\n", $tag_struct->tag_successors.ndeps_completed
			
 
				   printf "\tndeps_remaining:\t\t<%u>\n", $tag_struct->tag_successors->ndeps - $tag_struct->tag_successors->ndeps_completed
			
@@ -317,7 +321,11 @@ define starpu-all-tasks
 
				     while $l != &all_jobs_list
			
 
				       set $j = (struct _starpu_job*) (((unsigned long) $l) - ((unsigned long) &((struct _starpu_job *)0)->all_submitted))
			
 
				       set $task = $j->task
			
 
				-      printf "task %p %s\n", $task, $task->name ? $task->name : ""
			
 
				+      if $task->name
			
 
				+        printf "task %p %s\n", $task, $task->name
			
 
				+      else
			
 
				+        printf "task %p\n", $task
			
 
				+      end
			
 
				       set $l = $l->next
			
 
				     end
			
 
				   end
			
@@ -915,9 +923,9 @@ end
 
				 define starpu-sched-print-component
			
 
				     set $c = (struct starpu_sched_component *) $arg1
			
 
				     starpu-print-spaces $arg0
			
 
				-    printf "%s %s %s (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? "homogeneous":"heterogeneous", $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? "single-node" : "multi-node", $c
			
 
				+    printf "%s %c %c (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? 'o':'e', $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? 's' : 'm', $c
			
 
				     if $c->push_task == fifo_push_task
			
 
				-      set $f = ((struct _starpu_fifo_data *) $c->data)->fifo
			
 
				+      set $f = &((struct _starpu_fifo_data *) $c->data)->fifo
			
 
				       starpu-print-spaces $arg0
			
 
				       printf "%d tasks start %f len %f end %f processed %d\n", $f->ntasks, $f->exp_start, $f->exp_len, $f->exp_end, $f->nprocessed
			
 
				     end
			
@@ -951,29 +959,29 @@ end
 
				 
			
 
				 define starpu-mpi-print-request
			
 
				     set $request = (struct _starpu_mpi_req *)$arg0
			
 
				-    set $request_type = "unknown_type"
			
 
				+    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type ", $request, $request->data_handle, $request->data_handle && $request->data_handle->mpi_data ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.node.rank,
			
 
				     if $request->request_type == SEND_REQ
			
 
				-       set $request_type = "SEND_REQ"
			
 
				+       printf "SEND_REQ"
			
 
				     end
			
 
				     if $request->request_type == RECV_REQ
			
 
				-       set $request_type = "RECV_REQ"
			
 
				+       printf "RECV_REQ"
			
 
				     end
			
 
				     if $request->request_type == WAIT_REQ
			
 
				-       set $request_type = "WAIT_REQ"
			
 
				+       printf "WAIT_REQ"
			
 
				     end
			
 
				     if $request->request_type == TEST_REQ
			
 
				-       set $request_type = "TEST_REQ"
			
 
				+       printf "TEST_REQ"
			
 
				     end
			
 
				     if $request->request_type == BARRIER_REQ
			
 
				-       set $request_type = "BARRIER_REQ"
			
 
				+       printf "BARRIER_REQ"
			
 
				     end
			
 
				     if $request->request_type == PROBE_REQ
			
 
				-       set $request_type = "PROBE_REQ"
			
 
				+       printf "PROBE_REQ"
			
 
				     end
			
 
				     if $request->request_type == UNKNOWN_REQ
			
 
				-       set $request_type = "UNKNOWN_REQ"
			
 
				+       printf "UNKNOWN_REQ"
			
 
				     end
			
 
				-    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type %s submitted %d completed %d posted %d detached %d is_internal_req %d\n", $request, $request->data_handle, $request->data_handle ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.rank, $request_type, $request->submitted, $request->completed, $request->posted, $request->detached, $request->is_internal_req
			
 
				+    printf " submitted %d completed %d posted %d detached %d\n", $request->submitted, $request->completed, $request->posted, $request->detached
			
 
				 end
			
 
				 
			
 
				 define starpu-mpi-print-ready-recv-requests
			
@@ -989,17 +997,39 @@ define starpu-mpi-print-ready-recv-requests
 
				     end
			
 
				 end
			
 
				 
			
 
				+define starpu-mpi-print-requests-list
			
 
				+  set $list = $arg0
			
 
				+  set $request = $list->_head
			
 
				+  while $request
			
 
				+    starpu-mpi-print-request $request
			
 
				+    set $request = $request->_next
			
 
				+  end
			
 
				+end
			
 
				+
			
 
				+define starpu-mpi-print-requests-tree
			
 
				+  if $arg0
			
 
				+    starpu-mpi-print-requests-tree $arg0->children[0]
			
 
				+    set $stage = (struct _starpu_mpi_req_prio_list_stage *) $arg0
			
 
				+    starpu-mpi-print-requests-list (&($stage->list))
			
 
				+    starpu-mpi-print-requests-tree $arg0->children[1]
			
 
				+  end
			
 
				+end
			
 
				+
			
 
				 define starpu-mpi-print-ready-send-requests
			
 
				-    set $list = (struct _starpu_mpi_req_prio_list) ready_send_requests
			
 
				-    if $list
			
 
				-	set $request = $list.list._head
			
 
				-        while $request
			
 
				-            starpu-mpi-print-request $request
			
 
				-	    set $request = $request->_next
			
 
				-	end
			
 
				+  set $prio_list = (struct _starpu_mpi_req_prio_list) ready_send_requests
			
 
				+  if _starpu_debug
			
 
				+    if $prio_list
			
 
				+        starpu-mpi-print-requests-list &$prio_list.list
			
 
				+    else
			
 
				+	printf "No ready send requests\n"
			
 
				+    end
			
 
				+  else
			
 
				+    if $prio_list.empty == 0
			
 
				+        starpu-mpi-print-requests-tree $prio_list.tree.root
			
 
				     else
			
 
				 	printf "No ready send requests\n"
			
 
				     end
			
 
				+  end
			
 
				 end
			
 
				 
			
 
				 define starpu-mpi-print-detached-requests
			
--- a/tools/starpu_fxt_tool.c
+++ b/tools/starpu_fxt_tool.c
@@ -81,7 +81,7 @@ static int parse_args(int argc, char **argv)
 
				 		{
			
 
				 			if (options.ninputfiles >= STARPU_FXT_MAX_FILES)
			
 
				 			{
			
 
				-				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%u)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
			
 
				+				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%d)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
			
 
				 				return 7;
			
 
				 			}
			
 
				 			options.filenames[options.ninputfiles++] = argv[++i];
			
@@ -179,7 +179,7 @@ static int parse_args(int argc, char **argv)
 
				 		{
			
 
				 			if (options.ninputfiles >= STARPU_FXT_MAX_FILES)
			
 
				 			{
			
 
				-				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%u)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
			
 
				+				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%d)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
			
 
				 				return 7;
			
 
				 			}
			
 
				 			options.filenames[options.ninputfiles++] = argv[i];
			
--- a/tools/starpu_perfmodel_recdump.c
+++ b/tools/starpu_perfmodel_recdump.c