5 years ago · 536fe80f62
--- a/ChangeLog
+++ b/ChangeLog
@@ -42,6 +42,7 @@ New features:
 
																   * Add a task prefetch level, to improve retaining data in accelerators so we
															
 
																     can make prefetch more aggressive.
															
 
																   * Add starpu_data_dup_ro().
															
 
																+  * Add starpu_data_release_to() and starpu_data_release_to_on_node().
															
 
																 Small changes:
															
 
																   * Add a synthetic energy efficiency testcase.
															
@@ -51,6 +52,7 @@ StarPU 1.3.5 (git revision xxx)
 
																 Small changes:
															
 
																   * Move MPI cache functions into the public API
															
 
																+  * Add STARPU_MPI_NOBIND environment variable.
															
 
																 StarPU 1.3.4 (git revision c37a5d024cd997596da41f765557c58099baf896)
															
 
																 ====================================================================
															
--- a/configure.ac
+++ b/configure.ac
@@ -249,6 +249,8 @@ AC_ARG_WITH(simgrid-lib-dir,
 
																 	], [simgrid_lib_dir=no])
															
 
																 if test x$enable_simgrid = xyes ; then
															
 
																+	PKG_CHECK_MODULES([SIMGRID], [simgrid])
															
 
																+
															
 
																    	if test -n "$SIMGRID_CFLAGS" ; then
															
 
																 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
															
 
																 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
															
@@ -836,7 +838,10 @@ if test x"$enable_native_winthreads" = xyes ; then
 
																 		AC_DEFINE(STARPU_NATIVE_WINTHREADS,[1],[Using native windows threads]),
															
 
																 		AC_MSG_ERROR([pthread_create unavailable]))
															
 
																 else
															
 
																-    AC_CHECK_LIB([pthread], [pthread_create])
															
 
																+    AC_CHECK_LIB([pthread], [pthread_create], [
															
 
																+        LIBS="$LIBS -lpthread"
															
 
																+        STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lpthread"
															
 
																+    ])
															
 
																 fi
															
 
																 AC_SEARCH_LIBS([sqrt],[m],,AC_MSG_ERROR([math library unavailable]))
															
@@ -957,8 +962,8 @@ AC_CHECK_FUNCS([mkdtemp])
 
																 AC_CHECK_FUNCS([pread pwrite])
															
 
																-AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--disable-hdf5], [disable HDF5 support])],
															
 
																-                    enable_hdf5=$enableval, enable_hdf5=maybe)
															
 
																+AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--enable-hdf5], [enable HDF5 support])],
															
 
																+                    enable_hdf5=$enableval, enable_hdf5=no)
															
 
																 if test "x$enable_hdf5" != xno ; then
															
 
																 	AC_ARG_WITH(hdf5-include-dir,
															
@@ -1017,8 +1022,11 @@ fi
 
																 if test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno"; then
															
 
																         AC_DEFINE([STARPU_HAVE_HDF5], [1], [Define to 1 if you have the <hdf5.h> header file.])
															
 
																+	enable_hdf5=yes
															
 
																+else
															
 
																+	enable_hdf5=no
															
 
																 fi
															
 
																-AM_CONDITIONAL(STARPU_HAVE_HDF5, test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno")
															
 
																+AM_CONDITIONAL(STARPU_HAVE_HDF5, test "x$enable_hdf5" = "xyes")
															
 
																 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
															
@@ -2523,8 +2531,8 @@ AC_SUBST(STARPU_EXPORT_DYNAMIC)
 
																 # Computes the maximum number of different kernels a message-passing sink
															
 
																 # can lookup for and launch.
															
 
																 AC_MSG_CHECKING(Maximum number of message-passing kernels)
															
 
																-AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING([
															
 
																-	      -enable-maxmpkernels=<number>],
															
 
																+AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING(
															
 
																+	      [--enable-maxmpkernels=<number>],
															
 
																 	      [maximum number of kernels a message-passing sink can lookup
															
 
																 	      for and execute])],
															
 
																 	      maxmpkernels=$enableval, maxmpkernels=10)
															
@@ -3111,6 +3119,9 @@ fi
 
																 AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
															
 
																 			[Disable multiple linear regression models])],
															
 
																 			enable_mlr=$enableval, enable_mlr=$default_enable_mlr)
															
 
																+AC_ARG_ENABLE(mlr-system-blas, [AS_HELP_STRING([--enable-mlr-system-blas],
															
 
																+			[Make the multiple linear regression models use the system BLAS instead of min-dgels])],
															
 
																+			enable_mlr_blas=$enableval, enable_mlr_blas=no)
															
 
																 AC_MSG_CHECKING(whether multiple linear regression models are disabled)
															
 
																 if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
															
@@ -3121,11 +3132,11 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 
																 	if test x$blas_lib = xnone ; then
															
 
																 	   use_system_lapack=no
															
 
																 	fi
															
 
																-	if test x$use_system_lapack = xyes; then
															
 
																+	if test x$enable_mlr_blas = xyes -a test x$use_system_lapack = xyes; then
															
 
																 	   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
															
 
																 		LDFLAGS="-llapack $LDFLAGS"
															
 
																 	else
															
 
																-		if test x$blas_lib = xmkl; then
															
 
																+		if test x$enable_mlr_blas=xyes -a test x$blas_lib = xmkl; then
															
 
																 		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use mkl library])
															
 
																 		else
															
 
																 			AC_MSG_CHECKING(whether min-dgels is linked)
															
@@ -3142,9 +3153,6 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 
																 					install_min_dgels=no
															
 
																 					support_mlr=no
															
 
																 				else
															
 
																-					if test ! -d $PWD/min-dgels; then
															
 
																-						cp -r $srcdir/min-dgels $PWD/
															
 
																-					fi
															
 
																 					AC_MSG_RESULT(yes)
															
 
																 					DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/minlibblas.a $STARPU_BUILD_DIR/min-dgels/build/minlibdgels.a $STARPU_BUILD_DIR/min-dgels/build/minlibf2c.a -Wl,--end-group"
															
 
																 					AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
															
@@ -3590,7 +3598,7 @@ if test "x$enable_shared" = xno; then
 
																         # No .so, so application will unexpected have to know which -l to
															
 
																         # use. Give them in .pc file.
															
 
																 	AC_DEFINE(STARPU_STATIC_ONLY, [1], [Only static compilation was made])
															
 
																-	STARPU_EXPORTED_LIBS="$LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
															
 
																+	STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
															
 
																 fi
															
 
																 AC_SUBST(STARPU_EXPORTED_LIBS)
															
@@ -3631,6 +3639,7 @@ AC_CONFIG_COMMANDS([executable-scripts], [
 
																   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
															
 
																   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
															
 
																   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
															
 
																+  test -e tests/microbenchs/bandwidth_scheds.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/bandwidth_scheds.sh tests/microbenchs/
															
 
																   mkdir -p tests/energy
															
 
																   test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
															
 
																   test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
															
@@ -3806,6 +3815,7 @@ AC_MSG_NOTICE([
 
																                Scheduler Hypervisor:                          $build_sc_hypervisor
															
 
																                simgrid enabled:                               $enable_simgrid
															
 
																                ayudame enabled:                               $ayu_msg
															
 
																+               HDF5 enabled:                                  $enable_hdf5
															
 
																 	       Native fortran support:                        $enable_build_fortran
															
 
																 	       Native MPI fortran support:                    $use_mpi_fort
															
 
																 	       Support for multiple linear regression models: $support_mlr
															
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -352,7 +352,20 @@ use <c>export STARPU_CALIBRATE=1</c> (\ref STARPU_CALIBRATE). This may be necess
 
																 has not-so-stable performance. StarPU will force calibration (and thus ignore
															
 
																 the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
															
 
																 made on each architecture, to avoid bad scheduling decisions just because the
															
 
																-first measurements were not so good. Details on the current performance model status
															
 
																+first measurements were not so good.
															
 
																+
															
 
																+Note that StarPU will not record the very first measurement for a given codelet
															
 
																+and a given size, because it would most often be hit by computation library
															
 
																+loading or initialization. StarPU will also throw measurements away if it
															
 
																+notices, after computing an average execution time, that most
															
 
																+subsequent tasks have an execution time largely outside the computed average
															
 
																+("Too big deviation for model..." warning messages). By looking at the details
															
 
																+of the message and their reported measurements, it can highlight that your
															
 
																+computation library really has non-stable measurements, which is probably an
															
 
																+indication of an issue in the computation library, or the execution environment
															
 
																+(e.g. rogue daemons).
															
 
																+
															
 
																+Details on the current performance model status
															
 
																 can be obtained with the tool <c>starpu_perfmodel_display</c>: the
															
 
																 option <c>-l</c> lists the available performance models, and the
															
 
																 option <c>-s</c> allows to choose the performance model to be
															
--- a/doc/doxygen/chapters/370_online_performance_tools.doxy
+++ b/doc/doxygen/chapters/370_online_performance_tools.doxy
@@ -375,7 +375,11 @@ parameter are stored in <c>.starpu/sampling/codelets/tmp/</c>
 
																 directory. These files are reused when \ref STARPU_CALIBRATE
															
 
																 environment variable is set to <c>1</c>, to recompute coefficients
															
 
																 based on the current, but also on the previous
															
 
																-executions. Additionally, when multiple linear regression models are
															
 
																+executions. By default StarPU uses a lightweight dgels implementation, but the
															
 
																+\ref enable-mlr-system-blas "--enable-mlr-system-blas" configure option can be
															
 
																+used to make StarPU use a system-provided dgels BLAS.
															
 
																+
															
 
																+Additionally, when multiple linear regression models are
															
 
																 disabled (using \ref disable-mlr "--disable-mlr" configure option) or when the
															
 
																 <c>model->combinations</c> are not defined, StarPU will still write
															
 
																 output files into <c>.starpu/sampling/codelets/tmp/</c> to allow
															
--- a/doc/doxygen/chapters/470_simgrid.doxy
+++ b/doc/doxygen/chapters/470_simgrid.doxy
@@ -167,8 +167,12 @@ theory results), see the \ref STARPU_SIMGRID_TRANSFER_COST, \ref STARPU_SIMGRID_
 
																 \section SimulationMPIApplications MPI Applications
															
 
																-StarPU-MPI applications can also be run in SimGrid mode. It needs to be compiled
															
 
																-with \c smpicc, and run using the <c>starpu_smpirun</c> script, for instance:
															
 
																+StarPU-MPI applications can also be run in SimGrid mode. smpi currently requires
															
 
																+that StarPU be built statically only, so <c>--disable-shared</c> needs to be
															
 
																+passed to <c>./configure</c>.
															
 
																+
															
 
																+The application needs to be compiled with \c smpicc, and run using the
															
 
																+<c>starpu_smpirun</c> script, for instance:
															
 
																 \verbatim
															
 
																 $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mpi/tests/pingpong
															
@@ -182,6 +186,13 @@ in case of a heterogeneous platform, it is possible to use the
 
																 option <c>-hostfile-platform</c> in <c>starpu_smpirun</c>, that will define
															
 
																 \ref STARPU_MPI_HOSTNAMES with the hostnames of your hostfile.
															
 
																+So as to use FxT traces, libfxt also needs to be built statically, <b>and</b>
															
 
																+with dynamic linking flags, i.e. with
															
 
																+
															
 
																+\verbatim
															
 
																+CFLAGS=-fPIC ./configure --enable-static
															
 
																+\endverbatim
															
 
																+
															
 
																 \section SimulationDebuggingApplications Debugging Applications
															
 
																 By default, SimGrid uses its own implementation of threads, which prevents \c gdb
															
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -255,6 +255,14 @@ it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
 
																 workers.
															
 
																 </dd>
															
 
																+<dt>STARPU_MPI_NOBIND</dt>
															
 
																+<dd>
															
 
																+\anchor STARPU_MPI_NOBIND
															
 
																+\addindex __env__STARPU_MPI_NOBIND
															
 
																+Setting it to non-zero will prevent StarPU from binding the MPI thread to
															
 
																+a separate core. This is for instance useful when running the testsuite on a single system.
															
 
																+</dd>
															
 
																+
															
 
																 <dt>STARPU_WORKERS_CUDAID</dt>
															
 
																 <dd>
															
 
																 \anchor STARPU_WORKERS_CUDAID
															
--- a/doc/doxygen/chapters/510_configure_options.doxy
+++ b/doc/doxygen/chapters/510_configure_options.doxy
@@ -571,11 +571,11 @@ Specify the blas library to be used by some of the examples. Librairies availabl
 
																 Enable linking with LevelDB if available
															
 
																 </dd>
															
 
																-<dt>--disable-hdf5</dt>
															
 
																+<dt>--enable-hdf5</dt>
															
 
																 <dd>
															
 
																-\anchor disable-hdf5
															
 
																-\addindex __configure__--disable-hdf5
															
 
																-Disable building HDF5 support.
															
 
																+\anchor enable-hdf5
															
 
																+\addindex __configure__--enable-hdf5
															
 
																+Enable building HDF5 support.
															
 
																 </dd>
															
 
																 <dt>--with-hdf5-include-dir=<c>path</c></dt>
															
@@ -768,6 +768,14 @@ this parameter is 10. Experimental.
 
																 Allow to disable multiple linear regression models (see \ref PerformanceModelExample)
															
 
																 </dd>
															
 
																+<dt>--enable-mlr-system-blas</dt>
															
 
																+<dd>
															
 
																+\anchor enable-mlr-system-blas
															
 
																+\addindex __configure__--enable-mlr-system-blas
															
 
																+Allow to make multiple linear regression models use the system-provided BLAS for dgels
															
 
																+(see \ref PerformanceModelExample)
															
 
																+</dd>
															
 
																+
															
 
																 </dl>
															
 
																 */
															
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -206,6 +206,7 @@ STARPU_EXAMPLES +=				\
 
																 	api/csr_data_interface			\
															
 
																 	api/matrix_data_interface		\
															
 
																 	api/multiformat_data_interface		\
															
 
																+	api/tensor_data_interface		\
															
 
																 	api/variable_data_interface		\
															
 
																 	api/vector_data_interface		\
															
 
																 	api/void_data_interface
															
--- a/examples/api/bcsr_data_interface.c
+++ b/examples/api/bcsr_data_interface.c
@@ -17,6 +17,17 @@
 
																 // This program checks that the implementation of the BCSR data
															
 
																 // interface only uses StarPU's public API
															
 
																+#define starpu_interface_bcsr_ops my_starpu_interface_bcsr_ops
															
 
																+#define starpu_bcsr_data_register my_starpu_bcsr_data_register
															
 
																+#define starpu_bcsr_get_nnz my_starpu_bcsr_get_nnz
															
 
																+#define starpu_bcsr_get_nrow my_starpu_bcsr_get_nrow
															
 
																+#define starpu_bcsr_get_firstentry my_starpu_bcsr_get_firstentry
															
 
																+#define starpu_bcsr_get_r my_starpu_bcsr_get_r
															
 
																+#define starpu_bcsr_get_c my_starpu_bcsr_get_c
															
 
																+#define starpu_bcsr_get_elemsize my_starpu_bcsr_get_elemsize
															
 
																+#define starpu_bcsr_get_local_nzval my_starpu_bcsr_get_local_nzval
															
 
																+#define starpu_bcsr_get_local_colind my_starpu_bcsr_get_local_colind
															
 
																+#define starpu_bcsr_get_local_rowptr my_starpu_bcsr_get_local_rowptr
															
 
																 #include "../../src/datawizard/interfaces/bcsr_interface.c"
															
 
																 int main()
															
--- a/examples/api/block_data_interface.c
+++ b/examples/api/block_data_interface.c
@@ -17,6 +17,16 @@
 
																 // This program checks that the implementation of the block data
															
 
																 // interface only uses StarPU's public API
															
 
																+#define starpu_interface_block_ops my_starpu_interface_block_ops
															
 
																+#define starpu_block_data_register my_starpu_block_data_register
															
 
																+#define starpu_block_ptr_register my_starpu_block_ptr_register
															
 
																+#define starpu_block_get_nx my_starpu_block_get_nx
															
 
																+#define starpu_block_get_ny my_starpu_block_get_ny
															
 
																+#define starpu_block_get_nz my_starpu_block_get_nz
															
 
																+#define starpu_block_get_local_ldy my_starpu_block_get_local_ldy
															
 
																+#define starpu_block_get_local_ldz my_starpu_block_get_local_ldz
															
 
																+#define starpu_block_get_local_ptr my_starpu_block_get_local_ptr
															
 
																+#define starpu_block_get_elemsize my_starpu_block_get_elemsize
															
 
																 #include "../../src/datawizard/interfaces/block_interface.c"
															
 
																 int main()
															
--- a/examples/api/coo_data_interface.c
+++ b/examples/api/coo_data_interface.c
@@ -17,6 +17,8 @@
 
																 // This program checks that the implementation of the COO data
															
 
																 // interface only uses StarPU's public API
															
 
																+#define starpu_interface_coo_ops my_starpu_interface_coo_ops
															
 
																+#define starpu_coo_data_register my_starpu_coo_data_register
															
 
																 #include "../../src/datawizard/interfaces/coo_interface.c"
															
 
																 int main()
															
--- a/examples/api/csr_data_interface.c
+++ b/examples/api/csr_data_interface.c
@@ -17,6 +17,15 @@
 
																 // This program checks that the implementation of the CSR data
															
 
																 // interface only uses StarPU's public API
															
 
																+#define starpu_interface_csr_ops my_starpu_interface_csr_ops
															
 
																+#define starpu_csr_data_register my_starpu_csr_data_register
															
 
																+#define starpu_csr_get_nnz my_starpu_csr_get_nnz
															
 
																+#define starpu_csr_get_nrow my_starpu_csr_get_nrow
															
 
																+#define starpu_csr_get_firstentry my_starpu_csr_get_firstentry
															
 
																+#define starpu_csr_get_elemsize my_starpu_csr_get_elemsize
															
 
																+#define starpu_csr_get_local_nzval my_starpu_csr_get_local_nzval
															
 
																+#define starpu_csr_get_local_colind my_starpu_csr_get_local_colind
															
 
																+#define starpu_csr_get_local_rowptr my_starpu_csr_get_local_rowptr
															
 
																 #include "../../src/datawizard/interfaces/csr_interface.c"
															
 
																 int main()
															
--- a/examples/api/matrix_data_interface.c
+++ b/examples/api/matrix_data_interface.c
@@ -17,6 +17,16 @@
 
																 // This program checks that the implementation of the matrix data
															
 
																 // interface only uses StarPU's public API
															
 
																+#define starpu_interface_matrix_ops my_starpu_interface_matrix_ops
															
 
																+#define starpu_matrix_data_register my_starpu_matrix_data_register
															
 
																+#define starpu_matrix_data_register_allocsize my_starpu_matrix_data_register_allocsize
															
 
																+#define starpu_matrix_ptr_register my_starpu_matrix_data_ptr_register
															
 
																+#define starpu_matrix_get_nx my_starpu_matrix_get_nx
															
 
																+#define starpu_matrix_get_ny my_starpu_matrix_get_ny
															
 
																+#define starpu_matrix_get_local_ld my_starpu_matrix_get_local_ld
															
 
																+#define starpu_matrix_get_local_ptr my_starpu_matrix_get_local_ptr
															
 
																+#define starpu_matrix_get_elemsize my_starpu_matrix_get_elemsize
															
 
																+#define starpu_matrix_get_allocsize my_starpu_matrix_get_allocsize
															
 
																 #include "../../src/datawizard/interfaces/matrix_interface.c"
															
 
																 int main()
															
--- a/examples/api/multiformat_data_interface.c
+++ b/examples/api/multiformat_data_interface.c
@@ -17,6 +17,8 @@
 
																 // This program checks that the implementation of the multiformat data
															
 
																 // interface only uses StarPU's public API
															
 
																+#define starpu_interface_multiformat_ops my_starpu_interface_multiformat_ops
															
 
																+#define starpu_multiformat_data_register my_starpu_multiformat_data_register
															
 
																 #include "../../src/datawizard/interfaces/multiformat_interface.c"
															
 
																 int main()
															
--- a/examples/api/tensor_data_interface.c
+++ b/examples/api/tensor_data_interface.c
@@ -0,0 +1,37 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+// This program checks that the implementation of the tensor data
															
 
																+// interface only uses StarPU's public API
															
 
																+
															
 
																+#define starpu_interface_tensor_ops my_starpu_interface_tensor_ops
															
 
																+#define starpu_tensor_data_register my_starpu_tensor_data_register
															
 
																+#define starpu_tensor_ptr_register my_starpu_tensor_data_ptr_register
															
 
																+#define starpu_tensor_get_nx my_starpu_tensor_get_nx
															
 
																+#define starpu_tensor_get_ny my_starpu_tensor_get_ny
															
 
																+#define starpu_tensor_get_nz my_starpu_tensor_get_nz
															
 
																+#define starpu_tensor_get_nt my_starpu_tensor_get_nt
															
 
																+#define starpu_tensor_get_local_ldy my_starpu_tensor_get_local_ldy
															
 
																+#define starpu_tensor_get_local_ldz my_starpu_tensor_get_local_ldz
															
 
																+#define starpu_tensor_get_local_ldt my_starpu_tensor_get_local_ldt
															
 
																+#define starpu_tensor_get_local_ptr my_starpu_tensor_get_local_ptr
															
 
																+#define starpu_tensor_get_elemsize my_starpu_tensor_get_elemsize
															
 
																+#include "../../src/datawizard/interfaces/tensor_interface.c"
															
 
																+
															
 
																+int main()
															
 
																+{
															
 
																+        return 0;
															
 
																+}
															
--- a/examples/api/variable_data_interface.c
+++ b/examples/api/variable_data_interface.c
@@ -17,6 +17,11 @@
 
																 // This program checks that the implementation of the variable data
															
 
																 // interface only uses StarPU's public API
															
 
																+#define starpu_interface_variable_ops my_starpu_interface_variable_ops
															
 
																+#define starpu_variable_data_register my_starpu_variable_data_register
															
 
																+#define starpu_variable_ptr_register my_starpu_variable_ptr_register
															
 
																+#define starpu_variable_get_local_ptr my_starpu_variable_get_local_ptr
															
 
																+#define starpu_variable_get_elemsize my_starpu_variable_get_elemsize
															
 
																 #include "../../src/datawizard/interfaces/variable_interface.c"
															
 
																 int main()
															
--- a/examples/api/vector_data_interface.c
+++ b/examples/api/vector_data_interface.c
@@ -17,6 +17,14 @@
 
																 // This program checks that the implementation of the vector data
															
 
																 // interface only uses StarPU's public API
															
 
																+#define starpu_interface_vector_ops my_starpu_interface_vector_ops
															
 
																+#define starpu_vector_data_register my_starpu_vector_data_register
															
 
																+#define starpu_vector_data_register_allocsize my_starpu_vector_data_register_allocsize
															
 
																+#define starpu_vector_ptr_register my_starpu_vector_data_ptr_register
															
 
																+#define starpu_vector_get_nx my_starpu_vector_get_nx
															
 
																+#define starpu_vector_get_local_ptr my_starpu_vector_get_local_ptr
															
 
																+#define starpu_vector_get_elemsize my_starpu_vector_get_elemsize
															
 
																+#define starpu_vector_get_allocsize my_starpu_vector_get_allocsize
															
 
																 #include "../../src/datawizard/interfaces/vector_interface.c"
															
 
																 int main()
															
--- a/examples/api/void_data_interface.c
+++ b/examples/api/void_data_interface.c
@@ -17,6 +17,8 @@
 
																 // This program checks that the implementation of the void data
															
 
																 // interface only uses StarPU's public API
															
 
																+#define starpu_interface_void_ops my_starpu_interface_void_ops
															
 
																+#define starpu_void_data_register my_starpu_void_data_register
															
 
																 #include "../../src/datawizard/interfaces/void_interface.c"
															
 
																 int main()
															
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -92,29 +92,26 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
																 		}
															
 
																 		starpu_data_wont_use(sdatakk);
															
 
																-		for (m = k+1; m<nblocks; m++)
															
 
																+		for (n = k+1; n<nblocks; n++)
															
 
																 		{
															
 
																-                        starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
															
 
																-			for (n = k+1; n<nblocks; n++)
															
 
																+                        starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
															
 
																+			for (m = n; m<nblocks; m++)
															
 
																 			{
															
 
																-				if (n <= m)
															
 
																-                                {
															
 
																-					starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
															
 
																-					starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
															
 
																-
															
 
																-					ret = starpu_task_insert(&cl22,
															
 
																-								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
															
 
																-								 STARPU_R, sdatamk,
															
 
																-								 STARPU_R, sdatank,
															
 
																-								 cl22.modes[2], sdatamn,
															
 
																-								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
															
 
																-								 STARPU_TAG_ONLY, TAG22(k,m,n),
															
 
																-								 0);
															
 
																-					if (ret == -ENODEV) return 77;
															
 
																-					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
															
 
																-                                }
															
 
																+				starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
															
 
																+				starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
															
 
																+
															
 
																+				ret = starpu_task_insert(&cl22,
															
 
																+							 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
															
 
																+							 STARPU_R, sdatamk,
															
 
																+							 STARPU_R, sdatank,
															
 
																+							 cl22.modes[2], sdatamn,
															
 
																+							 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
															
 
																+							 STARPU_TAG_ONLY, TAG22(k,m,n),
															
 
																+							 0);
															
 
																+				if (ret == -ENODEV) return 77;
															
 
																+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
															
 
																 			}
															
 
																-			starpu_data_wont_use(sdatamk);
															
 
																+			starpu_data_wont_use(sdatank);
															
 
																 		}
															
 
																 		starpu_iteration_pop();
															
 
																 	}
															
--- a/examples/filters/fblock.c
+++ b/examples/filters/fblock.c
@@ -169,6 +169,11 @@ int main(void)
 
																         print_data(handle);
															
 
																         starpu_data_unregister(handle);
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        ret = starpu_opencl_unload_opencl(&opencl_program);
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
															
 
																+#endif
															
 
																+
															
 
																         /* Print result block */
															
 
																         FPRINTF(stderr, "OUT Block\n");
															
 
																         print_block(block, NX, NY, NZ, NX, NX*NY);
															
--- a/examples/filters/fblock_opencl.c
+++ b/examples/filters/fblock_opencl.c
@@ -60,8 +60,8 @@ void opencl_func(void *buffers[], void *cl_arg)
 
																 	CHECK_CL_SET_KERNEL_ARG(kernel, 7, sizeof(*factor), factor);
															
 
																 	{
															
 
																-		size_t global=nx*ny*nz;
															
 
																-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
															
 
																+		size_t global[3]={nx,ny,nz};
															
 
																+		err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
															
 
																 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																 	}
															
 
																 	starpu_opencl_release_kernel(kernel);
															
--- a/examples/filters/fblock_opencl_kernel.cl
+++ b/examples/filters/fblock_opencl_kernel.cl
@@ -18,14 +18,17 @@
 
																 __kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
															
 
																 {
															
 
																-        int i, j, k;
															
 
																-        block = (__global char *)block + offset;
															
 
																-        for(k=0; k<nz ; k++)
															
 
																-	{
															
 
																-                for(j=0; j<ny ; j++)
															
 
																-		{
															
 
																-                        for(i=0; i<nx ; i++)
															
 
																-                                block[(k*ldz)+(j*ldy)+i] = factor;
															
 
																-                }
															
 
																-        }
															
 
																+	const int idx = get_global_id(0);
															
 
																+	const int idy = get_global_id(1);
															
 
																+	const int idz = get_global_id(2);
															
 
																+	if (idx >= nx)
															
 
																+		return;
															
 
																+	if (idy >= ny)
															
 
																+		return;
															
 
																+	if (idz >= nz)
															
 
																+		return;
															
 
																+
															
 
																+	block = (__global int*) ((__global char *)block + offset);
															
 
																+	int i = idz*ldz + idy*ldy + idx;
															
 
																+	block[i] = factor;
															
 
																 }
															
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -350,12 +350,30 @@ void starpu_data_release(starpu_data_handle_t handle);
 
																 /**
															
 
																    Similar to starpu_data_release(), except that the data
															
 
																-   will be available on the given memory \p node instead of main memory.
															
 
																+   was made available on the given memory \p node instead of main memory.
															
 
																    The \p node parameter must be exactly the same as the corresponding \c
															
 
																    starpu_data_acquire_on_node* call.
															
 
																 */
															
 
																 void starpu_data_release_on_node(starpu_data_handle_t handle, int node);
															
 
																+/**
															
 
																+   Partly release the piece of data acquired by the application either by
															
 
																+   starpu_data_acquire() or by starpu_data_acquire_cb(), switching the
															
 
																+   acquisition down to \p down_to_mode. For now, only releasing from STARPU_RW
															
 
																+   or STARPU_W acquisition down to STARPU_R is supported, or down to the same
															
 
																+   acquisition.  STARPU_NONE can also be passed as \p down_to_mode, in which
															
 
																+   case this is equivalent to calling starpu_data_release().
															
 
																+*/
															
 
																+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
															
 
																+
															
 
																+/**
															
 
																+   Similar to starpu_data_release_to(), except that the data
															
 
																+   was made available on the given memory \p node instead of main memory.
															
 
																+   The \p node parameter must be exactly the same as the corresponding \c
															
 
																+   starpu_data_acquire_on_node* call.
															
 
																+*/
															
 
																+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode, int node);
															
 
																+
															
 
																 /** @} */
															
 
																 /**
															
--- a/include/starpu_fxt.h
+++ b/include/starpu_fxt.h
@@ -69,42 +69,36 @@ struct starpu_fxt_options
 
																 	char *number_events_path;
															
 
																 	char *anim_path;
															
 
																 	char *states_path;
															
 
																+	char worker_names[STARPU_NMAXWORKERS][256];
															
 
																+	int nworkers;
															
 
																+	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
															
 
																 	/**
															
 
																 	   In case we are going to gather multiple traces (e.g in the case of
															
 
																 	   MPI processes), we may need to prefix the name of the containers.
															
 
																 	*/
															
 
																 	char *file_prefix;
															
 
																+
															
 
																 	/**
															
 
																 	   In case we are going to gather multiple traces (e.g in the case of
															
 
																-	   MPI processes), we may need to prefix the name of the containers.
															
 
																+	   MPI processes), this variable stores the time offset relative to rank 0.
															
 
																 	*/
															
 
																 	uint64_t file_offset;
															
 
																+
															
 
																 	/**
															
 
																 	   In case we are going to gather multiple traces (e.g in the case of
															
 
																-	   MPI processes), we may need to prefix the name of the containers.
															
 
																+	   MPI processes), this variable stores the MPI rank of the trace file.
															
 
																 	*/
															
 
																 	int file_rank;
															
 
																 	/**
															
 
																-	   Output parameters
															
 
																-	*/
															
 
																-	char worker_names[STARPU_NMAXWORKERS][256];
															
 
																-	/**
															
 
																-	   Output parameters
															
 
																-	*/
															
 
																-	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
															
 
																-	/**
															
 
																-	   Output parameters
															
 
																-	*/
															
 
																-	int nworkers;
															
 
																-
															
 
																-	/**
															
 
																 	   In case we want to dump the list of codelets to an external tool
															
 
																 	*/
															
 
																 	struct starpu_fxt_codelet_event **dumped_codelets;
															
 
																+
															
 
																 	/**
															
 
																-	   In case we want to dump the list of codelets to an external tool
															
 
																+	   Number of dumped codelets, in case we want to dump the list of codelets
															
 
																+	   to an external tool.
															
 
																 	*/
															
 
																 	long dumped_codelets_count;
															
 
																 };
															
--- a/min-dgels/Makefile.in
+++ b/min-dgels/Makefile.in
@@ -7,13 +7,17 @@ ADDITIONAL=additional
 
																 all:
															
 
																 	mkdir -p build
															
 
																-	[ -d "$(CLAPACK)" ] || cp -a $(srcdir)/$(CLAPACK) .
															
 
																+	[ -d "$(CLAPACK)" ] || ( cp -a $(srcdir)/$(CLAPACK) . ; chmod -R +rwX $(CLAPACK) )
															
 
																 	cd $(CLAPACK) && $(MAKE) blaslib CC="$(CC)" LD="$(LD)"
															
 
																 	cd $(CLAPACK) && $(MAKE) f2clib CC="$(CC)" LD="$(LD)"
															
 
																-	[ -d "$(ADDITIONAL)" ] || cp -a $(srcdir)/$(ADDITIONAL) .
															
 
																+	[ -d "$(ADDITIONAL)" ] || ( cp -a $(srcdir)/$(ADDITIONAL) . ; chmod -R +rwX $(ADDITIONAL) )
															
 
																 	cd $(ADDITIONAL) && $(CC) -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
															
 
																 install:
															
 
																+installcheck:
															
 
																+uninstall:
															
 
																+distuninstallcheck:
															
 
																+dvi:
															
 
																 clean:
															
 
																 	-cd $(CLAPACK) && $(MAKE) clean && rm -rf *~
															
@@ -21,6 +25,7 @@ clean:
 
																 	rm -rf build *~
															
 
																 distclean: clean
															
 
																+	[ -f Makefile.in ] || rm -fr $(CLAPACK) $(ADDITIONAL)
															
 
																 # This part is needed by StarPU
															
--- a/min-dgels/additional/blaswrap.h
+++ b/min-dgels/additional/blaswrap.h
@@ -5,156 +5,4 @@
 
																 #ifndef __BLASWRAP_H
															
 
																 #define __BLASWRAP_H
															
 
																-#ifndef NO_BLAS_WRAP
															
 
																- 
															
 
																-/* BLAS1 routines */
															
 
																-#define _starpu_srotg_ f2c_srotg
															
 
																-#define _starpu_crotg_ f2c_crotg
															
 
																-#define _starpu_drotg_ f2c_drotg
															
 
																-#define _starpu_zrotg_ f2c_zrotg
															
 
																-#define _starpu_srotmg_ f2c_srotmg
															
 
																-#define _starpu_drotmg_ f2c_drotmg
															
 
																-#define _starpu_srot_ f2c_srot
															
 
																-#define _starpu_drot_ f2c_drot
															
 
																-#define _starpu_srotm_ f2c_srotm
															
 
																-#define _starpu_drotm_ f2c_drotm
															
 
																-#define _starpu_sswap_ f2c_sswap
															
 
																-#define _starpu_dswap_ f2c_dswap
															
 
																-#define _starpu_cswap_ f2c_cswap
															
 
																-#define _starpu_zswap_ f2c_zswap
															
 
																-#define _starpu_sscal_ f2c_sscal
															
 
																-#define _starpu_dscal_ f2c_dscal
															
 
																-#define _starpu_cscal_ f2c_cscal
															
 
																-#define _starpu_zscal_ f2c_zscal
															
 
																-#define _starpu_csscal_ f2c_csscal
															
 
																-#define _starpu_zdscal_ f2c_zdscal
															
 
																-#define _starpu_scopy_ f2c_scopy
															
 
																-#define _starpu_dcopy_ f2c_dcopy
															
 
																-#define _starpu_ccopy_ f2c_ccopy
															
 
																-#define _starpu_zcopy_ f2c_zcopy
															
 
																-#define _starpu_saxpy_ f2c_saxpy
															
 
																-#define _starpu_daxpy_ f2c_daxpy
															
 
																-#define _starpu_caxpy_ f2c_caxpy
															
 
																-#define _starpu_zaxpy_ f2c_zaxpy
															
 
																-#define _starpu_sdot_ f2c_sdot
															
 
																-#define _starpu_ddot_ f2c_ddot
															
 
																-#define _starpu_cdotu_ f2c_cdotu
															
 
																-#define _starpu_zdotu_ f2c_zdotu
															
 
																-#define _starpu_cdotc_ f2c_cdotc
															
 
																-#define _starpu_zdotc_ f2c_zdotc
															
 
																-#define _starpu_snrm2_ f2c_snrm2
															
 
																-#define _starpu_dnrm2_ f2c_dnrm2
															
 
																-#define _starpu_scnrm2_ f2c_scnrm2
															
 
																-#define _starpu_dznrm2_ f2c_dznrm2
															
 
																-#define _starpu_sasum_ f2c_sasum
															
 
																-#define _starpu_dasum_ f2c_dasum
															
 
																-#define _starpu_scasum_ f2c_scasum
															
 
																-#define _starpu_dzasum_ f2c_dzasum
															
 
																-#define _starpu_isamax_ f2c_isamax
															
 
																-#define _starpu_idamax_ f2c_idamax
															
 
																-#define _starpu_icamax_ f2c_icamax
															
 
																-#define _starpu_izamax_ f2c_izamax
															
 
																- 
															
 
																-/* BLAS2 routines */
															
 
																-#define _starpu_sgemv_ f2c_sgemv
															
 
																-#define _starpu_dgemv_ f2c_dgemv
															
 
																-#define _starpu_cgemv_ f2c_cgemv
															
 
																-#define _starpu_zgemv_ f2c_zgemv
															
 
																-#define _starpu_sgbmv_ f2c_sgbmv
															
 
																-#define _starpu_dgbmv_ f2c_dgbmv
															
 
																-#define _starpu_cgbmv_ f2c_cgbmv
															
 
																-#define _starpu_zgbmv_ f2c_zgbmv
															
 
																-#define _starpu_chemv_ f2c_chemv
															
 
																-#define _starpu_zhemv_ f2c_zhemv
															
 
																-#define _starpu_chbmv_ f2c_chbmv
															
 
																-#define _starpu_zhbmv_ f2c_zhbmv
															
 
																-#define _starpu_chpmv_ f2c_chpmv
															
 
																-#define _starpu_zhpmv_ f2c_zhpmv
															
 
																-#define _starpu_ssymv_ f2c_ssymv
															
 
																-#define _starpu_dsymv_ f2c_dsymv
															
 
																-#define _starpu_ssbmv_ f2c_ssbmv
															
 
																-#define _starpu_dsbmv_ f2c_dsbmv
															
 
																-#define _starpu_sspmv_ f2c_sspmv
															
 
																-#define _starpu_dspmv_ f2c_dspmv
															
 
																-#define _starpu_strmv_ f2c_strmv
															
 
																-#define _starpu_dtrmv_ f2c_dtrmv
															
 
																-#define _starpu_ctrmv_ f2c_ctrmv
															
 
																-#define _starpu_ztrmv_ f2c_ztrmv
															
 
																-#define _starpu_stbmv_ f2c_stbmv
															
 
																-#define _starpu_dtbmv_ f2c_dtbmv
															
 
																-#define _starpu_ctbmv_ f2c_ctbmv
															
 
																-#define _starpu_ztbmv_ f2c_ztbmv
															
 
																-#define _starpu_stpmv_ f2c_stpmv
															
 
																-#define _starpu_dtpmv_ f2c_dtpmv
															
 
																-#define _starpu_ctpmv_ f2c_ctpmv
															
 
																-#define _starpu_ztpmv_ f2c_ztpmv
															
 
																-#define _starpu_strsv_ f2c_strsv
															
 
																-#define _starpu_dtrsv_ f2c_dtrsv
															
 
																-#define _starpu_ctrsv_ f2c_ctrsv
															
 
																-#define _starpu_ztrsv_ f2c_ztrsv
															
 
																-#define _starpu_stbsv_ f2c_stbsv
															
 
																-#define _starpu_dtbsv_ f2c_dtbsv
															
 
																-#define _starpu_ctbsv_ f2c_ctbsv
															
 
																-#define _starpu_ztbsv_ f2c_ztbsv
															
 
																-#define _starpu_stpsv_ f2c_stpsv
															
 
																-#define _starpu_dtpsv_ f2c_dtpsv
															
 
																-#define _starpu_ctpsv_ f2c_ctpsv
															
 
																-#define _starpu_ztpsv_ f2c_ztpsv
															
 
																-#define _starpu_sger_ f2c_sger
															
 
																-#define _starpu_dger_ f2c_dger
															
 
																-#define _starpu_cgeru_ f2c_cgeru
															
 
																-#define _starpu_zgeru_ f2c_zgeru
															
 
																-#define _starpu_cgerc_ f2c_cgerc
															
 
																-#define _starpu_zgerc_ f2c_zgerc
															
 
																-#define _starpu_cher_ f2c_cher
															
 
																-#define _starpu_zher_ f2c_zher
															
 
																-#define _starpu_chpr_ f2c_chpr
															
 
																-#define _starpu_zhpr_ f2c_zhpr
															
 
																-#define _starpu_cher2_ f2c_cher2
															
 
																-#define _starpu_zher2_ f2c_zher2
															
 
																-#define _starpu_chpr2_ f2c_chpr2
															
 
																-#define _starpu_zhpr2_ f2c_zhpr2
															
 
																-#define _starpu_ssyr_ f2c_ssyr
															
 
																-#define _starpu_dsyr_ f2c_dsyr
															
 
																-#define _starpu_sspr_ f2c_sspr
															
 
																-#define _starpu_dspr_ f2c_dspr
															
 
																-#define _starpu_ssyr2_ f2c_ssyr2
															
 
																-#define _starpu_dsyr2_ f2c_dsyr2
															
 
																-#define _starpu_sspr2_ f2c_sspr2
															
 
																-#define _starpu_dspr2_ f2c_dspr2
															
 
																- 
															
 
																-/* BLAS3 routines */
															
 
																-#define _starpu_sgemm_ f2c_sgemm
															
 
																-#define _starpu_dgemm_ f2c_dgemm
															
 
																-#define _starpu_cgemm_ f2c_cgemm
															
 
																-#define _starpu_zgemm_ f2c_zgemm
															
 
																-#define _starpu_ssymm_ f2c_ssymm
															
 
																-#define _starpu_dsymm_ f2c_dsymm
															
 
																-#define _starpu_csymm_ f2c_csymm
															
 
																-#define _starpu_zsymm_ f2c_zsymm
															
 
																-#define _starpu_chemm_ f2c_chemm
															
 
																-#define _starpu_zhemm_ f2c_zhemm
															
 
																-#define _starpu_ssyrk_ f2c_ssyrk
															
 
																-#define _starpu_dsyrk_ f2c_dsyrk
															
 
																-#define _starpu_csyrk_ f2c_csyrk
															
 
																-#define _starpu_zsyrk_ f2c_zsyrk
															
 
																-#define _starpu_cherk_ f2c_cherk
															
 
																-#define _starpu_zherk_ f2c_zherk
															
 
																-#define _starpu_ssyr2k_ f2c_ssyr2k
															
 
																-#define _starpu_dsyr2k_ f2c_dsyr2k
															
 
																-#define _starpu_csyr2k_ f2c_csyr2k
															
 
																-#define _starpu_zsyr2k_ f2c_zsyr2k
															
 
																-#define _starpu_cher2k_ f2c_cher2k
															
 
																-#define _starpu_zher2k_ f2c_zher2k
															
 
																-#define _starpu_strmm_ f2c_strmm
															
 
																-#define _starpu_dtrmm_ f2c_dtrmm
															
 
																-#define _starpu_ctrmm_ f2c_ctrmm
															
 
																-#define _starpu_ztrmm_ f2c_ztrmm
															
 
																-#define _starpu_strsm_ f2c_strsm
															
 
																-#define _starpu_dtrsm_ f2c_dtrsm
															
 
																-#define _starpu_ctrsm_ f2c_ctrsm
															
 
																-#define _starpu_ztrsm_ f2c_ztrsm
															
 
																-
															
 
																-#endif /* NO_BLAS_WRAP */
															
 
																-
															
 
																 #endif /* __BLASWRAP_H */
															
--- a/min-dgels/base/F2CLIBS/libf2c/Makefile
+++ b/min-dgels/base/F2CLIBS/libf2c/Makefile
@@ -175,6 +175,14 @@ xwsne.o:	fio.h
 
																 xwsne.o:	lio.h
															
 
																 xwsne.o:	fmt.h
															
 
																+main.o:		signal1.h
															
 
																+signal_.o:	signal1.h
															
 
																+s_paus.o:	signal1.h
															
 
																+
															
 
																+err.o:		sysdep1.h
															
 
																+fio.h:		sysdep1.h
															
 
																+util.c:		sysdep1.h
															
 
																+
															
 
																 arith.h: arithchk.c
															
 
																 	$(CC) $(CFLAGS) -DNO_FPINIT arithchk.c -lm ||\
															
 
																 	 $(CC) -DNO_LONG_LONG $(CFLAGS) -DNO_FPINIT arithchk.c -lm
															
--- a/min-dgels/base/INCLUDE/blaswrap.h
+++ b/min-dgels/base/INCLUDE/blaswrap.h
@@ -5,156 +5,4 @@
 
																 #ifndef __BLASWRAP_H
															
 
																 #define __BLASWRAP_H
															
 
																-#ifndef NO_BLAS_WRAP
															
 
																- 
															
 
																-/* BLAS1 routines */
															
 
																-#define _starpu_srotg_ f2c_srotg
															
 
																-#define _starpu_crotg_ f2c_crotg
															
 
																-#define _starpu_drotg_ f2c_drotg
															
 
																-#define _starpu_zrotg_ f2c_zrotg
															
 
																-#define _starpu_srotmg_ f2c_srotmg
															
 
																-#define _starpu_drotmg_ f2c_drotmg
															
 
																-#define _starpu_srot_ f2c_srot
															
 
																-#define _starpu_drot_ f2c_drot
															
 
																-#define _starpu_srotm_ f2c_srotm
															
 
																-#define _starpu_drotm_ f2c_drotm
															
 
																-#define _starpu_sswap_ f2c_sswap
															
 
																-#define _starpu_dswap_ f2c_dswap
															
 
																-#define _starpu_cswap_ f2c_cswap
															
 
																-#define _starpu_zswap_ f2c_zswap
															
 
																-#define _starpu_sscal_ f2c_sscal
															
 
																-#define _starpu_dscal_ f2c_dscal
															
 
																-#define _starpu_cscal_ f2c_cscal
															
 
																-#define _starpu_zscal_ f2c_zscal
															
 
																-#define _starpu_csscal_ f2c_csscal
															
 
																-#define _starpu_zdscal_ f2c_zdscal
															
 
																-#define _starpu_scopy_ f2c_scopy
															
 
																-#define _starpu_dcopy_ f2c_dcopy
															
 
																-#define _starpu_ccopy_ f2c_ccopy
															
 
																-#define _starpu_zcopy_ f2c_zcopy
															
 
																-#define _starpu_saxpy_ f2c_saxpy
															
 
																-#define _starpu_daxpy_ f2c_daxpy
															
 
																-#define _starpu_caxpy_ f2c_caxpy
															
 
																-#define _starpu_zaxpy_ f2c_zaxpy
															
 
																-#define _starpu_sdot_ f2c_sdot
															
 
																-#define _starpu_ddot_ f2c_ddot
															
 
																-#define _starpu_cdotu_ f2c_cdotu
															
 
																-#define _starpu_zdotu_ f2c_zdotu
															
 
																-#define _starpu_cdotc_ f2c_cdotc
															
 
																-#define _starpu_zdotc_ f2c_zdotc
															
 
																-#define _starpu_snrm2_ f2c_snrm2
															
 
																-#define _starpu_dnrm2_ f2c_dnrm2
															
 
																-#define _starpu_scnrm2_ f2c_scnrm2
															
 
																-#define _starpu_dznrm2_ f2c_dznrm2
															
 
																-#define _starpu_sasum_ f2c_sasum
															
 
																-#define _starpu_dasum_ f2c_dasum
															
 
																-#define _starpu_scasum_ f2c_scasum
															
 
																-#define _starpu_dzasum_ f2c_dzasum
															
 
																-#define _starpu_isamax_ f2c_isamax
															
 
																-#define _starpu_idamax_ f2c_idamax
															
 
																-#define _starpu_icamax_ f2c_icamax
															
 
																-#define _starpu_izamax_ f2c_izamax
															
 
																- 
															
 
																-/* BLAS2 routines */
															
 
																-#define _starpu_sgemv_ f2c_sgemv
															
 
																-#define _starpu_dgemv_ f2c_dgemv
															
 
																-#define _starpu_cgemv_ f2c_cgemv
															
 
																-#define _starpu_zgemv_ f2c_zgemv
															
 
																-#define _starpu_sgbmv_ f2c_sgbmv
															
 
																-#define _starpu_dgbmv_ f2c_dgbmv
															
 
																-#define _starpu_cgbmv_ f2c_cgbmv
															
 
																-#define _starpu_zgbmv_ f2c_zgbmv
															
 
																-#define _starpu_chemv_ f2c_chemv
															
 
																-#define _starpu_zhemv_ f2c_zhemv
															
 
																-#define _starpu_chbmv_ f2c_chbmv
															
 
																-#define _starpu_zhbmv_ f2c_zhbmv
															
 
																-#define _starpu_chpmv_ f2c_chpmv
															
 
																-#define _starpu_zhpmv_ f2c_zhpmv
															
 
																-#define _starpu_ssymv_ f2c_ssymv
															
 
																-#define _starpu_dsymv_ f2c_dsymv
															
 
																-#define _starpu_ssbmv_ f2c_ssbmv
															
 
																-#define _starpu_dsbmv_ f2c_dsbmv
															
 
																-#define _starpu_sspmv_ f2c_sspmv
															
 
																-#define _starpu_dspmv_ f2c_dspmv
															
 
																-#define _starpu_strmv_ f2c_strmv
															
 
																-#define _starpu_dtrmv_ f2c_dtrmv
															
 
																-#define _starpu_ctrmv_ f2c_ctrmv
															
 
																-#define _starpu_ztrmv_ f2c_ztrmv
															
 
																-#define _starpu_stbmv_ f2c_stbmv
															
 
																-#define _starpu_dtbmv_ f2c_dtbmv
															
 
																-#define _starpu_ctbmv_ f2c_ctbmv
															
 
																-#define _starpu_ztbmv_ f2c_ztbmv
															
 
																-#define _starpu_stpmv_ f2c_stpmv
															
 
																-#define _starpu_dtpmv_ f2c_dtpmv
															
 
																-#define _starpu_ctpmv_ f2c_ctpmv
															
 
																-#define _starpu_ztpmv_ f2c_ztpmv
															
 
																-#define _starpu_strsv_ f2c_strsv
															
 
																-#define _starpu_dtrsv_ f2c_dtrsv
															
 
																-#define _starpu_ctrsv_ f2c_ctrsv
															
 
																-#define _starpu_ztrsv_ f2c_ztrsv
															
 
																-#define _starpu_stbsv_ f2c_stbsv
															
 
																-#define _starpu_dtbsv_ f2c_dtbsv
															
 
																-#define _starpu_ctbsv_ f2c_ctbsv
															
 
																-#define _starpu_ztbsv_ f2c_ztbsv
															
 
																-#define _starpu_stpsv_ f2c_stpsv
															
 
																-#define _starpu_dtpsv_ f2c_dtpsv
															
 
																-#define _starpu_ctpsv_ f2c_ctpsv
															
 
																-#define _starpu_ztpsv_ f2c_ztpsv
															
 
																-#define _starpu_sger_ f2c_sger
															
 
																-#define _starpu_dger_ f2c_dger
															
 
																-#define _starpu_cgeru_ f2c_cgeru
															
 
																-#define _starpu_zgeru_ f2c_zgeru
															
 
																-#define _starpu_cgerc_ f2c_cgerc
															
 
																-#define _starpu_zgerc_ f2c_zgerc
															
 
																-#define _starpu_cher_ f2c_cher
															
 
																-#define _starpu_zher_ f2c_zher
															
 
																-#define _starpu_chpr_ f2c_chpr
															
 
																-#define _starpu_zhpr_ f2c_zhpr
															
 
																-#define _starpu_cher2_ f2c_cher2
															
 
																-#define _starpu_zher2_ f2c_zher2
															
 
																-#define _starpu_chpr2_ f2c_chpr2
															
 
																-#define _starpu_zhpr2_ f2c_zhpr2
															
 
																-#define _starpu_ssyr_ f2c_ssyr
															
 
																-#define _starpu_dsyr_ f2c_dsyr
															
 
																-#define _starpu_sspr_ f2c_sspr
															
 
																-#define _starpu_dspr_ f2c_dspr
															
 
																-#define _starpu_ssyr2_ f2c_ssyr2
															
 
																-#define _starpu_dsyr2_ f2c_dsyr2
															
 
																-#define _starpu_sspr2_ f2c_sspr2
															
 
																-#define _starpu_dspr2_ f2c_dspr2
															
 
																- 
															
 
																-/* BLAS3 routines */
															
 
																-#define _starpu_sgemm_ f2c_sgemm
															
 
																-#define _starpu_dgemm_ f2c_dgemm
															
 
																-#define _starpu_cgemm_ f2c_cgemm
															
 
																-#define _starpu_zgemm_ f2c_zgemm
															
 
																-#define _starpu_ssymm_ f2c_ssymm
															
 
																-#define _starpu_dsymm_ f2c_dsymm
															
 
																-#define _starpu_csymm_ f2c_csymm
															
 
																-#define _starpu_zsymm_ f2c_zsymm
															
 
																-#define _starpu_chemm_ f2c_chemm
															
 
																-#define _starpu_zhemm_ f2c_zhemm
															
 
																-#define _starpu_ssyrk_ f2c_ssyrk
															
 
																-#define _starpu_dsyrk_ f2c_dsyrk
															
 
																-#define _starpu_csyrk_ f2c_csyrk
															
 
																-#define _starpu_zsyrk_ f2c_zsyrk
															
 
																-#define _starpu_cherk_ f2c_cherk
															
 
																-#define _starpu_zherk_ f2c_zherk
															
 
																-#define _starpu_ssyr2k_ f2c_ssyr2k
															
 
																-#define _starpu_dsyr2k_ f2c_dsyr2k
															
 
																-#define _starpu_csyr2k_ f2c_csyr2k
															
 
																-#define _starpu_zsyr2k_ f2c_zsyr2k
															
 
																-#define _starpu_cher2k_ f2c_cher2k
															
 
																-#define _starpu_zher2k_ f2c_zher2k
															
 
																-#define _starpu_strmm_ f2c_strmm
															
 
																-#define _starpu_dtrmm_ f2c_dtrmm
															
 
																-#define _starpu_ctrmm_ f2c_ctrmm
															
 
																-#define _starpu_ztrmm_ f2c_ztrmm
															
 
																-#define _starpu_strsm_ f2c_strsm
															
 
																-#define _starpu_dtrsm_ f2c_dtrsm
															
 
																-#define _starpu_ctrsm_ f2c_ctrsm
															
 
																-#define _starpu_ztrsm_ f2c_ztrsm
															
 
																-
															
 
																-#endif /* NO_BLAS_WRAP */
															
 
																-
															
 
																 #endif /* __BLASWRAP_H */
															
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -47,10 +47,10 @@ endif
 
																 endif
															
 
																 if STARPU_HAVE_AM111
															
 
																-TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
															
 
																+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
															
 
																 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
															
 
																 else
															
 
																-TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
															
 
																+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
															
 
																 endif
															
 
																 if STARPU_MPI_CHECK
															
@@ -155,7 +155,9 @@ examplebin_PROGRAMS += 			\
 
																 if !STARPU_SIMGRID
															
 
																 starpu_mpi_EXAMPLES	+=	\
															
 
																 	mpi_lu/plu_implicit_example_float	\
															
 
																-	mpi_lu/plu_implicit_example_double
															
 
																+	mpi_lu/plu_implicit_example_double	\
															
 
																+	mpi_lu/plu_outofcore_example_float	\
															
 
																+	mpi_lu/plu_outofcore_example_double
															
 
																 endif
															
 
																 mpi_lu_plu_example_float_LDADD =	\
															
--- a/mpi/examples/benchs/abstract_sendrecv_bench.c
+++ b/mpi/examples/benchs/abstract_sendrecv_bench.c
@@ -22,25 +22,30 @@
 
																 void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
															
 
																 {
															
 
																 	uint64_t iterations = LOOPS_DEFAULT;
															
 
																+	uint64_t s = 0;
															
 
																+	uint64_t j = 0;
															
 
																+	uint64_t k = 0;
															
 
																 	if (mpi_rank >= 2)
															
 
																 	{
															
 
																+		starpu_pause();
															
 
																 		if (thread_barrier != NULL)
															
 
																 		{
															
 
																 			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
															
 
																 		}
															
 
																-		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
															
 
																+		for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
															
 
																 		{
															
 
																 			iterations = bench_nb_iterations(iterations, s);
															
 
																 			starpu_mpi_barrier(MPI_COMM_WORLD);
															
 
																-			for (uint64_t j = 0; j < iterations; j++)
															
 
																+			for (j = 0; j < iterations; j++)
															
 
																 			{
															
 
																 				starpu_mpi_barrier(MPI_COMM_WORLD);
															
 
																 			}
															
 
																 		}
															
 
																+		starpu_resume();
															
 
																 		return;
															
 
																 	}
															
@@ -64,7 +69,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
																 	}
															
 
																 	global_tstart = starpu_timing_now();
															
 
																-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
															
 
																+	for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
															
 
																 	{
															
 
																 		vector_send = malloc(s);
															
 
																 		vector_recv = malloc(s);
															
@@ -78,7 +83,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
																 		starpu_mpi_barrier(MPI_COMM_WORLD);
															
 
																-		for (uint64_t j = 0; j < iterations; j++)
															
 
																+		for (j = 0; j < iterations; j++)
															
 
																 		{
															
 
																 			if (mpi_rank == 0)
															
 
																 			{
															
@@ -111,7 +116,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
																 			const double d9_lat = lats[9 * (iterations - 1) / 10];
															
 
																 			double avg_lat = 0.0;
															
 
																-			for(uint64_t k = 0; k < iterations; k++)
															
 
																+			for(k = 0; k < iterations; k++)
															
 
																 			{
															
 
																 				avg_lat += lats[k];
															
 
																 			}
															
--- a/mpi/examples/benchs/burst_helper.c
+++ b/mpi/examples/benchs/burst_helper.c
@@ -45,7 +45,8 @@ void burst_init_data(int rank)
 
																 		recv_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
															
 
																 		send_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
															
 
																-		for (int i = 0; i < burst_nb_requests; i++)
															
 
																+		int i = 0;
															
 
																+		for (i = 0; i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			send_buffers[i] = malloc(NX_ARRAY * sizeof(float));
															
 
																 			memset(send_buffers[i], 0, NX_ARRAY * sizeof(float));
															
@@ -62,7 +63,8 @@ void burst_free_data(int rank)
 
																 {
															
 
																 	if (rank == 0 || rank == 1)
															
 
																 	{
															
 
																-		for (int i = 0; i < burst_nb_requests; i++)
															
 
																+		int i = 0;
															
 
																+		for (i = 0; i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			starpu_data_unregister(send_handles[i]);
															
 
																 			free(send_buffers[i]);
															
@@ -84,12 +86,13 @@ void burst_free_data(int rank)
 
																 void burst_bidir(int rank)
															
 
																 {
															
 
																 	int other_rank = (rank == 0) ? 1 : 0;
															
 
																+	int i = 0;
															
 
																 	FPRINTF(stderr, "Simultaneous....start (rank %d)\n", rank);
															
 
																 	if (rank == 0 || rank == 1)
															
 
																 	{
															
 
																-		for (int i = 0; i < burst_nb_requests; i++)
															
 
																+		for (i = 0; i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			recv_reqs[i] = NULL;
															
 
																 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
															
@@ -100,13 +103,13 @@ void burst_bidir(int rank)
 
																 	if (rank == 0 || rank == 1)
															
 
																 	{
															
 
																-		for (int i = 0; i < burst_nb_requests; i++)
															
 
																+		for (i = 0; i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			send_reqs[i] = NULL;
															
 
																 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
															
 
																 		}
															
 
																-		for (int i = 0; i < burst_nb_requests; i++)
															
 
																+		for (i = 0; i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
															
 
																 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
															
@@ -120,10 +123,11 @@ void burst_bidir(int rank)
 
																 void burst_unidir(int sender, int receiver, int rank)
															
 
																 {
															
 
																 	FPRINTF(stderr, "%d -> %d... start (rank %d)\n", sender, receiver, rank);
															
 
																+	int i = 0;
															
 
																 	if (rank == receiver)
															
 
																 	{
															
 
																-		for (int i = 0; i < burst_nb_requests; i++)
															
 
																+		for (i = 0; i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			recv_reqs[i] = NULL;
															
 
																 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], sender, i, MPI_COMM_WORLD);
															
@@ -134,7 +138,7 @@ void burst_unidir(int sender, int receiver, int rank)
 
																 	if (rank == sender)
															
 
																 	{
															
 
																-		for (int i = 0; i < burst_nb_requests; i++)
															
 
																+		for (i = 0; i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			send_reqs[i] = NULL;
															
 
																 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], receiver, i, i, MPI_COMM_WORLD);
															
@@ -143,7 +147,7 @@ void burst_unidir(int sender, int receiver, int rank)
 
																 	if (rank == sender || rank == receiver)
															
 
																 	{
															
 
																-		for (int i = 0; i < burst_nb_requests; i++)
															
 
																+		for (i = 0; i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			if (rank != sender && recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
															
 
																 			if (rank == sender && send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
															
@@ -160,12 +164,13 @@ void burst_bidir_half_postponed(int rank)
 
																 {
															
 
																 	int other_rank = (rank == 0) ? 1 : 0;
															
 
																 	int received = 0;
															
 
																+	int i = 0;
															
 
																 	FPRINTF(stderr, "Half/half burst...start (rank %d)\n", rank);
															
 
																 	if (rank == 0 || rank == 1)
															
 
																 	{
															
 
																-		for (int i = 0; i < burst_nb_requests; i++)
															
 
																+		for (i = 0; i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			recv_reqs[i] = NULL;
															
 
																 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
															
@@ -176,7 +181,7 @@ void burst_bidir_half_postponed(int rank)
 
																 	if (rank == 0 || rank == 1)
															
 
																 	{
															
 
																-		for (int i = 0; i < (burst_nb_requests / 2); i++)
															
 
																+		for (i = 0; i < (burst_nb_requests / 2); i++)
															
 
																 		{
															
 
																 			send_reqs[i] = NULL;
															
 
																 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
															
@@ -184,13 +189,13 @@ void burst_bidir_half_postponed(int rank)
 
																 		if (recv_reqs[burst_nb_requests / 4]) starpu_mpi_wait(&recv_reqs[burst_nb_requests / 4], MPI_STATUS_IGNORE);
															
 
																-		for (int i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
															
 
																+		for (i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			send_reqs[i] = NULL;
															
 
																 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
															
 
																 		}
															
 
																-		for (int i = 0; i < burst_nb_requests; i++)
															
 
																+		for (i = 0; i < burst_nb_requests; i++)
															
 
																 		{
															
 
																 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
															
 
																 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
															
--- a/mpi/examples/benchs/gemm_helper.c
+++ b/mpi/examples/benchs/gemm_helper.c
@@ -98,8 +98,9 @@ static void cpu_init_matrix_random(void *descr[], void *arg)
 
																 	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
															
 
																 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
															
 
																 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
															
 
																+	unsigned i = 0;
															
 
																-	for (unsigned i = 0; i < nx *ny; i++)
															
 
																+	for (i = 0; i < nx *ny; i++)
															
 
																 	{
															
 
																 		subA[i] = (TYPE) (starpu_drand48());
															
 
																 		subB[i] = (TYPE) (starpu_drand48());
															
@@ -113,8 +114,9 @@ static void cpu_init_matrix_zero(void *descr[], void *arg)
 
																 	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
															
 
																 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
															
 
																 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
															
 
																+	unsigned i = 0;
															
 
																-	for (unsigned i = 0; i < nx *ny; i++)
															
 
																+	for (i = 0; i < nx *ny; i++)
															
 
																 	{
															
 
																 		subA[i] = (TYPE) (0);
															
 
																 	}
															
@@ -290,18 +292,21 @@ void gemm_add_polling_dependencies()
 
																 {
															
 
																 	starpu_tag_t nb_tasks = (starpu_tag_t) nslices * (starpu_tag_t) nslices;
															
 
																 	unsigned nb_workers = starpu_worker_get_count();
															
 
																+	starpu_tag_t synchro_tag = 0;
															
 
																+	starpu_tag_t previous_tag = 0;
															
 
																+	starpu_tag_t next_tag = 0;
															
 
																-	for (starpu_tag_t synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
															
 
																+	for (synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
															
 
																 	{
															
 
																 		// this synchro tag depends on tasks of previous column of tasks:
															
 
																-		for (starpu_tag_t previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
															
 
																+		for (previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
															
 
																 		{
															
 
																 			starpu_tag_declare_deps(synchro_tag, 1, previous_tag);
															
 
																 		}
															
 
																 		// tasks of the next column of tasks depend on this synchro tag:
															
 
																 		// this actually allows workers to poll for new tasks, while no task is available
															
 
																-		for (starpu_tag_t next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
															
 
																+		for (next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
															
 
																 		{
															
 
																 			starpu_tag_declare_deps(next_tag, 1, synchro_tag);
															
 
																 		}
															
--- a/mpi/examples/benchs/sendrecv_bench.c
+++ b/mpi/examples/benchs/sendrecv_bench.c
@@ -27,9 +27,10 @@ int main(int argc, char **argv)
 
																 {
															
 
																 	int ret, rank, worldsize;
															
 
																 	int pause_workers = 0;
															
 
																+	int i = 0;
															
 
																-	for (int i = 1; i < argc; i++)
															
 
																+	for (i = 1; i < argc; i++)
															
 
																 	{
															
 
																 		if (strcmp(argv[i], "-p") == 0)
															
 
																 		{
															
--- a/mpi/examples/benchs/sendrecv_parallel_tasks_bench.c
+++ b/mpi/examples/benchs/sendrecv_parallel_tasks_bench.c
@@ -56,6 +56,8 @@ void cpu_task(void* descr[], void* args)
 
																 	double t1, t2;
															
 
																 	int asked_worker;
															
 
																 	int current_worker = starpu_worker_get_id();
															
 
																+	uint64_t j = 0;
															
 
																+	uint64_t k = 0;
															
 
																 	starpu_codelet_unpack_args(args, &mpi_rank, &asked_worker, &s, &handle_send, &handle_recv);
															
@@ -64,7 +66,7 @@ void cpu_task(void* descr[], void* args)
 
																 	iterations = bench_nb_iterations(iterations, s);
															
 
																 	double* lats = malloc(sizeof(double) * iterations);
															
 
																-	for (uint64_t j = 0; j < NB_WARMUP_PINGPONGS; j++)
															
 
																+	for (j = 0; j < NB_WARMUP_PINGPONGS; j++)
															
 
																 	{
															
 
																 		if (mpi_rank == 0)
															
 
																 		{
															
@@ -78,7 +80,7 @@ void cpu_task(void* descr[], void* args)
 
																 		}
															
 
																 	}
															
 
																-	for (uint64_t j = 0; j < iterations; j++)
															
 
																+	for (j = 0; j < iterations; j++)
															
 
																 	{
															
 
																 		if (mpi_rank == 0)
															
 
																 		{
															
@@ -107,7 +109,7 @@ void cpu_task(void* descr[], void* args)
 
																 		const double d9_lat = lats[9 * (iterations - 1) / 10];
															
 
																 		double avg_lat = 0.0;
															
 
																-		for(uint64_t k = 0; k < iterations; k++)
															
 
																+		for(k = 0; k < iterations; k++)
															
 
																 		{
															
 
																 			avg_lat += lats[k];
															
 
																 		}
															
@@ -167,6 +169,8 @@ int main(int argc, char **argv)
 
																 	unsigned cpu_count = starpu_cpu_worker_get_count();
															
 
																 	unsigned* mpi_tags = malloc(cpu_count * sizeof(unsigned));
															
 
																 	unsigned tag = 0;
															
 
																+	uint64_t s = 0;
															
 
																+	unsigned i = 0;
															
 
																 	int* workers = malloc(cpu_count * sizeof(int));
															
 
																 	float** vectors_send = malloc(cpu_count * sizeof(float*));
															
@@ -174,11 +178,11 @@ int main(int argc, char **argv)
 
																 	starpu_data_handle_t* handles_send = malloc(cpu_count * sizeof(starpu_data_handle_t));
															
 
																 	starpu_data_handle_t* handles_recv = malloc(cpu_count * sizeof(starpu_data_handle_t));
															
 
																-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
															
 
																+	for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
															
 
																 	{
															
 
																 		starpu_pause();
															
 
																-		for (unsigned i = 0; i < cpu_count; i++)
															
 
																+		for (i = 0; i < cpu_count; i++)
															
 
																 		{
															
 
																 			workers[i] = i;
															
 
																 			vectors_send[i] = malloc(s);
															
@@ -201,7 +205,7 @@ int main(int argc, char **argv)
 
																 		starpu_resume();
															
 
																 		starpu_task_wait_for_all();
															
 
																-		for (unsigned i = 0; i < cpu_count; i++)
															
 
																+		for (i = 0; i < cpu_count; i++)
															
 
																 		{
															
 
																 			starpu_data_unregister(handles_send[i]);
															
 
																 			starpu_data_unregister(handles_recv[i]);
															
--- a/mpi/examples/filters/filter.c
+++ b/mpi/examples/filters/filter.c
@@ -59,7 +59,7 @@ void vector_filter(void *father_interface, void *child_interface, struct starpu_
 
																 	STARPU_ASSERT_MSG(nchunks <= nx, "%u parts for %u elements", nchunks, nx);
															
 
																 	STARPU_ASSERT(nchunks == 2);
															
 
																-	STARPU_ASSERT_MSG((nx % nchunks) == 0, "nx=%d is not a multiple of nchunks %d\n", nx, nchunks);
															
 
																+	STARPU_ASSERT_MSG((nx % nchunks) == 0, "nx=%u is not a multiple of nchunks %u\n", nx, nchunks);
															
 
																 	vector_child->id = vector_father->id;
															
 
																 	vector_child->nx = nx/2;
															
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
@@ -20,6 +20,64 @@
 
																 #include <limits.h>
															
 
																 #include <math.h>
															
 
																+/* This is from magma
															
 
																+
															
 
																+  -- Innovative Computing Laboratory
															
 
																+  -- Electrical Engineering and Computer Science Department
															
 
																+  -- University of Tennessee
															
 
																+  -- (C) Copyright 2009
															
 
																+
															
 
																+  Redistribution  and  use  in  source and binary forms, with or without
															
 
																+  modification,  are  permitted  provided  that the following conditions
															
 
																+  are met:
															
 
																+
															
 
																+  * Redistributions  of  source  code  must  retain  the above copyright
															
 
																+    notice,  this  list  of  conditions  and  the  following  disclaimer.
															
 
																+  * Redistributions  in  binary  form must reproduce the above copyright
															
 
																+    notice,  this list of conditions and the following disclaimer in the
															
 
																+    documentation  and/or other materials provided with the distribution.
															
 
																+  * Neither  the  name of the University of Tennessee, Knoxville nor the
															
 
																+    names of its contributors may be used to endorse or promote products
															
 
																+    derived from this software without specific prior written permission.
															
 
																+
															
 
																+  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
															
 
																+  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
															
 
																+  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
															
 
																+  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
															
 
																+  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
															
 
																+  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
															
 
																+  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
															
 
																+  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
															
 
																+  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
															
 
																+  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
															
 
																+  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
															
 
																+
															
 
																+  */
															
 
																+
															
 
																+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
															
 
																+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
															
 
																+
															
 
																+#define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
															
 
																+
															
 
																+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
															
 
																+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
															
 
																+
															
 
																+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
															
 
																+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
															
 
																+
															
 
																+#define FMULS_TRSM FMULS_TRMM
															
 
																+#define FADDS_TRSM FADDS_TRMM /* additions: same count as TRMM additions, not its multiplications */
															
 
																+
															
 
																+#define FLOPS_STRSM(__m, __n) (     FMULS_TRSM((__m), (__n)) +       FADDS_TRSM((__m), (__n)) )
															
 
																+
															
 
																+
															
 
																+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
															
 
																+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
															
 
																+
															
 
																+#define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
															
 
																+
															
 
																+/* End of magma code */
															
 
																+
															
 
																 /*
															
 
																  *	Create the codelets
															
 
																  */
															
@@ -72,6 +130,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 
																 {
															
 
																 	unsigned k, m, n;
															
 
																 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
															
 
																+	unsigned nn = size/nblocks;
															
 
																 	for (k = 0; k < nblocks; k++)
															
 
																 	{
															
@@ -80,6 +139,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 
																 		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
															
 
																 				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
															
 
																 				       STARPU_RW, data_handles[k][k],
															
 
																+				       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
															
 
																 				       0);
															
 
																 		for (m = k+1; m<nblocks; m++)
															
@@ -88,28 +148,30 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 
																 					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
															
 
																 					       STARPU_R, data_handles[k][k],
															
 
																 					       STARPU_RW, data_handles[m][k],
															
 
																+					       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
															
 
																 					       0);
															
 
																 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
															
 
																 			if (my_distrib(k, k, nodes) == rank)
															
 
																 				starpu_data_wont_use(data_handles[k][k]);
															
 
																+		}
															
 
																-			for (n = k+1; n<nblocks; n++)
															
 
																+		for (n = k+1; n<nblocks; n++)
															
 
																+		{
															
 
																+			for (m = n; m<nblocks; m++)
															
 
																 			{
															
 
																-				if (n <= m)
															
 
																-				{
															
 
																-					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
															
 
																-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
															
 
																-							       STARPU_R, data_handles[n][k],
															
 
																-							       STARPU_R, data_handles[m][k],
															
 
																-							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
															
 
																-							       0);
															
 
																-				}
															
 
																+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
															
 
																+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
															
 
																+						       STARPU_R, data_handles[n][k],
															
 
																+						       STARPU_R, data_handles[m][k],
															
 
																+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
															
 
																+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
															
 
																+						       0);
															
 
																 			}
															
 
																-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
															
 
																-			if (my_distrib(m, k, nodes) == rank)
															
 
																-				starpu_data_wont_use(data_handles[m][k]);
															
 
																+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
															
 
																+			if (my_distrib(n, k, nodes) == rank)
															
 
																+				starpu_data_wont_use(data_handles[n][k]);
															
 
																 		}
															
 
																 		starpu_iteration_pop();
															
 
																 	}
															
@@ -120,6 +182,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 
																 {
															
 
																 	unsigned k, m, n;
															
 
																 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
															
 
																+	unsigned nn = size/nblocks;
															
 
																 	/* Column */
															
 
																 	for (n = 0; n<nblocks; n++)
															
@@ -137,7 +200,15 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 
																 						       STARPU_R, data_handles[n][k],
															
 
																 						       STARPU_R, data_handles[m][k],
															
 
																 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
															
 
																+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
															
 
																 						       0);
															
 
																+
															
 
																+				if (m == n)
															
 
																+				{
															
 
																+					/* Nobody else will need it */
															
 
																+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
															
 
																+					starpu_data_wont_use(data_handles[m][k]);
															
 
																+				}
															
 
																 			}
															
 
																 			k = n;
															
 
																 			if (m > n)
															
@@ -147,6 +218,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 
																 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
															
 
																 						       STARPU_R, data_handles[k][k],
															
 
																 						       STARPU_RW, data_handles[m][k],
															
 
																+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
															
 
																 						       0);
															
 
																 			}
															
 
																 			else
															
@@ -155,26 +227,27 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 
																 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
															
 
																 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
															
 
																 						       STARPU_RW, data_handles[k][k],
															
 
																+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
															
 
																 						       0);
															
 
																 			}
															
 
																+
															
 
																 		}
															
 
																+		/* We won't need it any more */
															
 
																+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
															
 
																+		starpu_data_wont_use(data_handles[n][n]);
															
 
																+
															
 
																 		starpu_iteration_pop();
															
 
																 	}
															
 
																-
															
 
																-	/* Submit flushes, StarPU will fit them according to the progress */
															
 
																-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
															
 
																-	for (m = 0; m < nblocks; m++)
															
 
																-		for (n = 0; n < nblocks ; n++)
															
 
																-			starpu_data_wont_use(data_handles[m][n]);
															
 
																 }
															
 
																 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
															
 
																 static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
															
 
																 {
															
 
																-	unsigned a, c;
															
 
																+	unsigned a;
															
 
																 	unsigned k, m, n;
															
 
																 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
															
 
																+	unsigned nn = size/nblocks;
															
 
																 	/* double-antidiagonal number:
															
 
																 	 * - a=0 contains (0,0) plus (1,0)
															
@@ -205,7 +278,15 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 
																 						       STARPU_R, data_handles[n][k],
															
 
																 						       STARPU_R, data_handles[m][k],
															
 
																 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
															
 
																+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
															
 
																 						       0);
															
 
																+
															
 
																+				if (m == nblocks-1)
															
 
																+				{
															
 
																+					/* Nobody else will need it */
															
 
																+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
															
 
																+					starpu_data_wont_use(data_handles[n][k]);
															
 
																+				}
															
 
																 			}
															
 
																 			/* k = n */
															
@@ -216,6 +297,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 
																 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
															
 
																 						       STARPU_R, data_handles[k][k],
															
 
																 						       STARPU_RW, data_handles[m][k],
															
 
																+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
															
 
																 						       0);
															
 
																 			}
															
 
																 			else
															
@@ -224,8 +306,16 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 
																 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
															
 
																 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
															
 
																 						       STARPU_RW, data_handles[k][k],
															
 
																+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
															
 
																 						       0);
															
 
																 			}
															
 
																+
															
 
																+			if (m == nblocks - 1)
															
 
																+			{
															
 
																+				/* We do not need the potrf result any more */
															
 
																+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
															
 
																+				starpu_data_wont_use(data_handles[n][n]);
															
 
																+			}
															
 
																 		}
															
 
																 		/* column within second antidiagonal for a */
															
@@ -246,7 +336,15 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 
																 						       STARPU_R, data_handles[n][k],
															
 
																 						       STARPU_R, data_handles[m][k],
															
 
																 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
															
 
																+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
															
 
																 						       0);
															
 
																+
															
 
																+				if (m == nblocks-1)
															
 
																+				{
															
 
																+					/* Nobody else will need it */
															
 
																+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
															
 
																+					starpu_data_wont_use(data_handles[n][k]);
															
 
																+				}
															
 
																 			}
															
 
																 			/* non-diagonal block, solve */
															
 
																 			k = n;
															
@@ -254,17 +352,19 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 
																 					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
															
 
																 					       STARPU_R, data_handles[k][k],
															
 
																 					       STARPU_RW, data_handles[m][k],
															
 
																+					       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
															
 
																 					       0);
															
 
																+
															
 
																+			if (m == nblocks - 1)
															
 
																+			{
															
 
																+				/* We do not need the potrf result any more */
															
 
																+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
															
 
																+				starpu_data_wont_use(data_handles[n][n]);
															
 
																+			}
															
 
																 		}
															
 
																 		starpu_iteration_pop();
															
 
																 	}
															
 
																-
															
 
																-	/* Submit flushes, StarPU will fit them according to the progress */
															
 
																-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
															
 
																-	for (m = 0; m < nblocks; m++)
															
 
																-		for (n = 0; n < nblocks ; n++)
															
 
																-			starpu_data_wont_use(data_handles[m][n]);
															
 
																 }
															
 
																 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
															
@@ -273,9 +373,10 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
																 	unsigned a;
															
 
																 	int k, m, n;
															
 
																 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
															
 
																+	unsigned nn = size/nblocks;
															
 
																 	/*
															
 
																-	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that prio ~ 2*a or 2*a+1
															
 
																+	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that gemm prio ~= 2*nblocks - a
															
 
																 	 * double-antidiagonal number:
															
 
																 	 * - a=0 contains (0,0) plus (1,0)
															
 
																 	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
															
@@ -285,41 +386,47 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
																 	{
															
 
																 		starpu_iteration_push(a);
															
 
																-		for (k = 0; k < nblocks; k++)
															
 
																+		for (k = 0; k < (int) nblocks; k++)
															
 
																 		{
															
 
																 			n = k;
															
 
																 			/* Should be m = a-k-n; for potrf and trsm to respect
															
 
																 			   priorities, but needs to be this for dependencies */
															
 
																 			m = a-2*k-n;
															
 
																-			if (m < 0 || m >= nblocks)
															
 
																-				continue;
															
 
																-
															
 
																 			if (m == n)
															
 
																 			{
															
 
																 				/* diagonal block, factorize */
															
 
																 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
															
 
																 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
															
 
																 						       STARPU_RW, data_handles[k][k],
															
 
																+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
															
 
																 						       0);
															
 
																 			}
															
 
																-			else
															
 
																+			else if (m >= n && m < (int) nblocks)
															
 
																 			{
															
 
																 				/* non-diagonal block, solve */
															
 
																 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
															
 
																 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
															
 
																 						       STARPU_R, data_handles[k][k],
															
 
																 						       STARPU_RW, data_handles[m][k],
															
 
																+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
															
 
																 						       0);
															
 
																 			}
															
 
																+			if (m == (int) nblocks - 1)
															
 
																+			{
															
 
																+				/* We do not need the potrf result any more */
															
 
																+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
															
 
																+				starpu_data_wont_use(data_handles[n][n]);
															
 
																+			}
															
 
																+
															
 
																 			/* column within antidiagonal for a */
															
 
																-			for (n = k + 1; n < nblocks; n++)
															
 
																+			for (n = k + 1; n < (int) nblocks; n++)
															
 
																 			{
															
 
																 				/* row */
															
 
																 				m = a-2*k-n;
															
 
																-				if (m >= n && m < nblocks)
															
 
																+				if (m >= n && m < (int) nblocks)
															
 
																 				{
															
 
																 					/* Update */
															
 
																 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
															
@@ -327,7 +434,14 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
																 							       STARPU_R, data_handles[n][k],
															
 
																 							       STARPU_R, data_handles[m][k],
															
 
																 							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
															
 
																+							       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
															
 
																 							       0);
															
 
																+					if (m == (int) nblocks - 1)
															
 
																+					{
															
 
																+						/* Nobody else will need it */
															
 
																+						starpu_data_wont_use(data_handles[n][k]);
															
 
																+						starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
															
 
																+					}
															
 
																 				}
															
 
																 			}
															
@@ -335,12 +449,6 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
																 		starpu_iteration_pop();
															
 
																 	}
															
 
																-
															
 
																-	/* Submit flushes, StarPU will fit them according to the progress */
															
 
																-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
															
 
																-	for (m = 0; m < nblocks; m++)
															
 
																-		for (n = 0; n < nblocks ; n++)
															
 
																-			starpu_data_wont_use(data_handles[m][n]);
															
 
																 }
															
 
																 /*
															
@@ -423,7 +531,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
																 	if (rank == 0)
															
 
																 	{
															
 
																 		*timing = end - start;
															
 
																-		*flops = (1.0f*size*size*size)/3.0f;
															
 
																+		*flops = FLOPS_SPOTRF(size);
															
 
																 	}
															
 
																 }
															
--- a/mpi/examples/matrix_decomposition/mpi_decomposition_params.c
+++ b/mpi/examples/matrix_decomposition/mpi_decomposition_params.c
@@ -56,63 +56,66 @@ void parse_args(int argc, char **argv, int nodes)
 
																                         size = strtol(argv[++i], &argptr, 10);
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-dblockx") == 0)
															
 
																+                else if (strcmp(argv[i], "-dblockx") == 0)
															
 
																                 {
															
 
																                         char *argptr;
															
 
																                         dblockx = strtol(argv[++i], &argptr, 10);
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-dblocky") == 0)
															
 
																+                else if (strcmp(argv[i], "-dblocky") == 0)
															
 
																                 {
															
 
																                         char *argptr;
															
 
																                         dblocky = strtol(argv[++i], &argptr, 10);
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-nblocks") == 0)
															
 
																+                else if (strcmp(argv[i], "-nblocks") == 0)
															
 
																                 {
															
 
																                         char *argptr;
															
 
																                         nblocks = strtol(argv[++i], &argptr, 10);
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-nbigblocks") == 0)
															
 
																+                else if (strcmp(argv[i], "-nbigblocks") == 0)
															
 
																                 {
															
 
																                         char *argptr;
															
 
																                         nbigblocks = strtol(argv[++i], &argptr, 10);
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-columns") == 0)
															
 
																+                else if (strcmp(argv[i], "-columns") == 0)
															
 
																                 {
															
 
																                         submission = COLUMNS;
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-antidiagonals") == 0)
															
 
																+                else if (strcmp(argv[i], "-antidiagonals") == 0)
															
 
																                 {
															
 
																                         submission = ANTIDIAGONALS;
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-prios") == 0)
															
 
																+                else if (strcmp(argv[i], "-prios") == 0)
															
 
																                 {
															
 
																                         submission = PRIOS;
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-no-prio") == 0)
															
 
																+                else if (strcmp(argv[i], "-no-prio") == 0)
															
 
																                 {
															
 
																                         noprio = 1;
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-check") == 0)
															
 
																+                else if (strcmp(argv[i], "-check") == 0)
															
 
																                 {
															
 
																                         check = 1;
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-display") == 0)
															
 
																+                else if (strcmp(argv[i], "-display") == 0)
															
 
																                 {
															
 
																                         display = 1;
															
 
																                 }
															
 
																-                if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
															
 
																+                else
															
 
																+                /* if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) */
															
 
																                 {
															
 
																                         printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-prios] [-no-prio] [-display] [-check]\n", argv[0]);
															
 
																+                        fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
															
 
																+                        exit(0);
															
 
																                 }
															
 
																         }
															
--- a/mpi/examples/mpi_lu/plu_example.c
+++ b/mpi/examples/mpi_lu/plu_example.c
@@ -37,8 +37,8 @@
 
																 static unsigned long size = 4096;
															
 
																 static unsigned nblocks = 16;
															
 
																 static unsigned check = 0;
															
 
																-static int p = 1;
															
 
																-static int q = 1;
															
 
																+static int p = -1;
															
 
																+static int q = -1;
															
 
																 static unsigned display = 0;
															
 
																 static unsigned no_prio = 0;
															
@@ -434,6 +434,7 @@ int main(int argc, char **argv)
 
																 	int rank;
															
 
																 	int world_size;
															
 
																 	int ret;
															
 
																+	unsigned i, j;
															
 
																 	/*
															
 
																 	 *	Initialization
															
@@ -462,7 +463,14 @@ int main(int argc, char **argv)
 
																 	/* We disable sequential consistency in this example */
															
 
																 	starpu_data_set_default_sequential_consistency_flag(0);
															
 
																-	STARPU_ASSERT(p*q == world_size);
															
 
																+	if (p == -1 && q==-1)
															
 
																+	{
															
 
																+		fprintf(stderr, "Setting default values for p and q\n");
															
 
																+		p = (q % 2 == 0) ? 2 : 1;
															
 
																+		q = world_size / p;
															
 
																+
															
 
																+	}
															
 
																+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
															
 
																 	starpu_cublas_init();
															
@@ -594,6 +602,18 @@ int main(int argc, char **argv)
 
																 	/*
															
 
																 	 * 	Termination
															
 
																 	 */
															
 
																+	for (j = 0; j < nblocks; j++)
															
 
																+	{
															
 
																+		for (i = 0; i < nblocks; i++)
															
 
																+		{
															
 
																+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
															
 
																+			TYPE *blockptr = dataA[j+i*nblocks];
															
 
																+			if (blockptr != STARPU_POISON_PTR)
															
 
																+				starpu_free(blockptr);
															
 
																+		}
															
 
																+	}
															
 
																+	free(dataA_handles);
															
 
																+	free(dataA);
															
 
																 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
															
 
																 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
															
--- a/mpi/examples/mpi_lu/plu_implicit_example.c
+++ b/mpi/examples/mpi_lu/plu_implicit_example.c
@@ -249,6 +249,7 @@ int main(int argc, char **argv)
 
																 	int rank;
															
 
																 	int world_size;
															
 
																 	int ret;
															
 
																+	unsigned i, j;
															
 
																 	starpu_srand48((long int)time(NULL));
															
@@ -376,6 +377,18 @@ int main(int argc, char **argv)
 
																 	/*
															
 
																 	 * 	Termination
															
 
																 	 */
															
 
																+	for (j = 0; j < nblocks; j++)
															
 
																+	{
															
 
																+		for (i = 0; i < nblocks; i++)
															
 
																+		{
															
 
																+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
															
 
																+			TYPE *blockptr = dataA[j+i*nblocks];
															
 
																+			if (blockptr != STARPU_POISON_PTR)
															
 
																+				starpu_free(blockptr);
															
 
																+		}
															
 
																+	}
															
 
																+	free(dataA_handles);
															
 
																+	free(dataA);
															
 
																 	starpu_cublas_shutdown();
															
 
																 	starpu_mpi_shutdown();
															
--- a/mpi/examples/mpi_lu/plu_outofcore_example.c
+++ b/mpi/examples/mpi_lu/plu_outofcore_example.c
@@ -39,9 +39,10 @@
 
																 static unsigned long size = 4096;
															
 
																 static unsigned nblocks = 16;
															
 
																+static size_t blocksize;
															
 
																 static unsigned check = 0;
															
 
																-static int p = 1;
															
 
																-static int q = 1;
															
 
																+static int p = -1;
															
 
																+static int q = -1;
															
 
																 static unsigned display = 0;
															
 
																 static unsigned no_prio = 0;
															
 
																 static char *path = "./starpu-ooc-files";
															
@@ -53,6 +54,9 @@ static unsigned numa = 0;
 
																 static size_t allocated_memory = 0;
															
 
																 static starpu_data_handle_t *dataA_handles;
															
 
																+static void **disk_objs;
															
 
																+
															
 
																+static int disk_node;
															
 
																 int get_block_rank(unsigned i, unsigned j);
															
@@ -142,7 +146,6 @@ static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnbl
 
																 static void create_matrix()
															
 
																 {
															
 
																-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
															
 
																 	TYPE *blockptr = malloc(blocksize);
															
 
																 	int fd;
															
 
																 	char *filename;
															
@@ -195,10 +198,9 @@ static void init_matrix(int rank)
 
																 {
															
 
																 	/* Allocate a grid of data handles, not all of them have to be allocated later on */
															
 
																 	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
															
 
																+	disk_objs = calloc(nblocks*nblocks, sizeof(*disk_objs));
															
 
																-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
															
 
																-
															
 
																-	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
															
 
																+	disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(16*1024*1024, size*size*sizeof(TYPE)));
															
 
																 	assert(disk_node >= 0);
															
 
																 	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
															
@@ -215,21 +217,21 @@ static void init_matrix(int rank)
 
																 			if (block_rank == rank)
															
 
																 			{
															
 
																-				void *disk_obj;
															
 
																 				snprintf(filename, sizeof(filename), "%u,%u", i, j);
															
 
																 				/* Register it to StarPU */
															
 
																-				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
															
 
																-				if (!disk_obj)
															
 
																+				disk_objs[j+nblocks*i] = starpu_disk_open(disk_node, filename, blocksize);
															
 
																+				if (!disk_objs[j+nblocks*i])
															
 
																 				{
															
 
																 					fprintf(stderr,"could not open %s\n", filename);
															
 
																 					exit(1);
															
 
																 				}
															
 
																 				starpu_matrix_data_register(handleptr, disk_node,
															
 
																-					(uintptr_t) disk_obj, size/nblocks,
															
 
																+					(uintptr_t) disk_objs[j+nblocks*i], size/nblocks,
															
 
																 					size/nblocks, size/nblocks, sizeof(TYPE));
															
 
																 			}
															
 
																 			else
															
 
																 			{
															
 
																+				disk_objs[j+nblocks*i] = NULL;
															
 
																 				starpu_matrix_data_register(handleptr, -1,
															
 
																 					0, size/nblocks,
															
 
																 					size/nblocks, size/nblocks, sizeof(TYPE));
															
@@ -273,6 +275,8 @@ int main(int argc, char **argv)
 
																 	parse_args(argc, argv);
															
 
																+	blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
															
 
																+
															
 
																 	ret = mkdir(path, 0777);
															
 
																 	if (ret != 0 && errno != EEXIST)
															
 
																 	{
															
@@ -286,7 +290,14 @@ int main(int argc, char **argv)
 
																 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
															
 
																 	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
															
 
																-	STARPU_ASSERT(p*q == world_size);
															
 
																+	if (p == -1 && q==-1)
															
 
																+	{
															
 
																+		fprintf(stderr, "Setting default values for p and q\n");
															
 
																+		p = (q % 2 == 0) ? 2 : 1;
															
 
																+		q = world_size / p;
															
 
																+
															
 
																+	}
															
 
																+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
															
 
																 	starpu_cublas_init();
															
@@ -401,8 +412,12 @@ int main(int argc, char **argv)
 
																 		for (i = 0; i < nblocks; i++)
															
 
																 		{
															
 
																 			starpu_data_unregister(dataA_handles[j+nblocks*i]);
															
 
																+			if (disk_objs[j+nblocks*i])
															
 
																+				starpu_disk_close(disk_node, disk_objs[j+nblocks*i], blocksize);
															
 
																 		}
															
 
																 	}
															
 
																+	free(dataA_handles);
															
 
																+	free(disk_objs);
															
 
																 	starpu_cublas_shutdown();
															
 
																 	starpu_mpi_shutdown();
															
--- a/mpi/src/mpi/starpu_mpi_mpi.c
+++ b/mpi/src/mpi/starpu_mpi_mpi.c
@@ -203,7 +203,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 			else
															
 
																 			{
															
 
																 				STARPU_ASSERT(req->count);
															
 
																-				_STARPU_MPI_MALLOC(req->ptr, req->count);
															
 
																+				req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
															
 
																 			}
															
 
																 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
															
@@ -225,12 +225,12 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
															
 
																 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
															
 
																-			/* Case: a receive request for a data with the given tag and source has already been
															
 
																-			 * posted by StarPU. Asynchronously requests a Read permission over the temporary handle ,
															
 
																-			 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
															
 
																-			 * will be called to bring the data back to the original data handle associated to the request.*/
															
 
																 			if (early_data_handle)
															
 
																 			{
															
 
																+				/* Case: a receive request for a data with the given tag and source has already been
															
 
																+				 * posted to MPI by StarPU. Asynchronously requests a Read permission over the temporary handle,
															
 
																+				 * so that when the internal receive is completed, the _starpu_mpi_early_data_cb function
															
 
																+				 * will be called to bring the data back to the original data handle associated to the request.*/
															
 
																 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
															
 
																 				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
															
 
																 				while (!(early_data_handle->req_ready))
															
@@ -254,16 +254,16 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
															
 
																 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
															
 
																-				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
															
 
																+				starpu_data_acquire_on_node_cb(early_data_handle->handle,STARPU_MAIN_RAM,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
															
 
																 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 			}
															
 
																-			/* Case: no matching data has been received. Store the receive request as an early_request. */
															
 
																 			else
															
 
																 			{
															
 
																 				struct _starpu_mpi_req *sync_req = _starpu_mpi_sync_data_find(req->node_tag.data_tag, req->node_tag.node.rank, req->node_tag.node.comm);
															
 
																 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
															
 
																 				if (sync_req)
															
 
																 				{
															
 
																+					/* Case: we already received the send envelope, we can proceed with the receive */
															
 
																 					req->sync = 1;
															
 
																 					_starpu_mpi_datatype_allocate(req->data_handle, req);
															
 
																 					if (req->registered_datatype == 1)
															
@@ -275,14 +275,16 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 					{
															
 
																 						req->count = sync_req->count;
															
 
																 						STARPU_ASSERT(req->count);
															
 
																-						_STARPU_MPI_MALLOC(req->ptr, req->count);
															
 
																+						req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
															
 
																 					}
															
 
																 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
															
 
																 					_STARPU_MPI_INC_READY_REQUESTS(+1);
															
 
																+					/* Throw away the dumb request that was only used to know that we got the envelope */
															
 
																 					_starpu_mpi_request_destroy(sync_req);
															
 
																 				}
															
 
																 				else
															
 
																 				{
															
 
																+					/* Case: no matching data has been received. Store the receive request as an early_request. */
															
 
																 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
															
 
																 					_starpu_mpi_early_request_enqueue(req);
															
 
																 				}
															
@@ -684,6 +686,8 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
																 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
															
 
																+	STARPU_VALGRIND_YIELD();
															
 
																+
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 	ret = req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, flag);
															
 
																 	if (*flag)
															
@@ -908,6 +912,8 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 
																 	_STARPU_MPI_LOG_OUT();
															
 
																 }
															
 
																+/* This is called when the data is now received in the early data handle, we can
															
 
																+ * now copy it over to the real handle. */
															
 
																 static void _starpu_mpi_early_data_cb(void* arg)
															
 
																 {
															
 
																 	struct _starpu_mpi_early_data_cb_args *args = arg;
															
@@ -954,7 +960,7 @@ static void _starpu_mpi_early_data_cb(void* arg)
 
																 	}
															
 
																 	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
															
 
																-	starpu_data_release(args->early_handle);
															
 
																+	starpu_data_release_on_node(args->early_handle, STARPU_MAIN_RAM);
															
 
																 	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
															
 
																 	/* XXX: note that we have already freed the registered buffer above. In
															
@@ -1101,8 +1107,6 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 
																 		_starpu_mpi_req_list_push_back(&detached_requests, req);
															
 
																 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
															
 
																-		starpu_wake_all_blocked_workers();
															
 
																-
															
 
																 		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 		STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
															
 
																 		STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
															
@@ -1204,14 +1208,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
																 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
															
 
																 	}
															
 
																-	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
															
 
																+	if (!_starpu_mpi_nobind && starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
															
 
																 	{
															
 
																 		char hostname[65];
															
 
																 		gethostname(hostname, sizeof(hostname));
															
 
																 		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
															
 
																 	}
															
 
																 	_starpu_mpi_do_initialize(argc_argv);
															
 
																-	if (_starpu_mpi_thread_cpuid >= 0)
															
 
																+	if (!_starpu_mpi_nobind && _starpu_mpi_thread_cpuid >= 0)
															
 
																 		/* In case MPI changed the binding */
															
 
																 		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
															
 
																 #else
															
@@ -1450,7 +1454,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
																 						else
															
 
																 						{
															
 
																 							early_request->count = envelope->size;
															
 
																-							_STARPU_MPI_MALLOC(early_request->ptr, early_request->count);
															
 
																+							early_request->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, early_request->count, 0);
															
 
																 							starpu_memory_allocate(STARPU_MAIN_RAM, early_request->count, STARPU_MEMORY_OVERFLOW);
															
 
																 							STARPU_MPI_ASSERT_MSG(early_request->ptr, "cannot allocate message of size %ld\n", early_request->count);
															
--- a/mpi/src/nmad/starpu_mpi_nmad.c
+++ b/mpi/src/nmad/starpu_mpi_nmad.c
@@ -245,6 +245,8 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
																 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
															
 
																 			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
															
 
																+	STARPU_VALGRIND_YIELD();
															
 
																+
															
 
																 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
															
 
																 	/* we must do a test_locked to avoid race condition :
															
@@ -344,7 +346,7 @@ void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_ev
 
																 				// req->ptr is freed by starpu_data_unpack
															
 
																 				starpu_data_unpack(req->data_handle, req->ptr, req->count);
															
 
																 			else
															
 
																-				free(req->ptr);
															
 
																+				starpu_free_on_node_flags(STARPU_MAIN_RAM, (uintptr_t) req->ptr, req->count, 0);
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
@@ -451,7 +453,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
																 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
															
 
																 #ifndef STARPU_SIMGRID
															
 
																-	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
															
 
																+	if (!_starpu_mpi_nobind && starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
															
 
																 	{
															
 
																 		char hostname[65];
															
 
																 		gethostname(hostname, sizeof(hostname));
															
@@ -623,7 +625,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
																 	 * required for piom_ltask_set_bound_thread_indexes() */
															
 
																 	_starpu_mpi_do_initialize(argc_argv);
															
 
																-	if (_starpu_mpi_thread_cpuid < 0)
															
 
																+	if (!_starpu_mpi_nobind && _starpu_mpi_thread_cpuid < 0)
															
 
																 	{
															
 
																 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
															
 
																 	}
															
@@ -633,7 +635,8 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
																 	/* Tell pioman to use a bound thread for communication progression:
															
 
																 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
															
 
																 	int indexes[1] = { _starpu_mpi_thread_cpuid };
															
 
																-	piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
															
 
																+	if (!_starpu_mpi_nobind)
															
 
																+		piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
															
 
																 	/* Register some hooks for communication progress if needed */
															
 
																 	int polling_point_prog, polling_point_idle;
															
--- a/mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c
+++ b/mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c
@@ -130,7 +130,7 @@ static void _starpu_mpi_unknown_datatype_recv_callback(nm_sr_event_t event, cons
 
																 		int ret = nm_sr_recv_peek(req->backend->session, &(req->backend->data_request), &(req->backend->unknown_datatype_size));
															
 
																 		STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "nm_sr_recv_peek returned %d", ret);
															
 
																-		req->ptr = malloc(req->count);
															
 
																+		req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
															
 
																 		STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld", req->count);
															
 
																 		nm_mpi_nmad_data_get(&(req->backend->unknown_datatype_body), (void*) req->ptr, req->datatype, req->count);
															
--- a/mpi/src/starpu_mpi_coop_sends.c
+++ b/mpi/src/starpu_mpi_coop_sends.c
@@ -47,13 +47,13 @@ void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req)
 
																 			/* We were last, release data */
															
 
																 			free(coop_sends->reqs_array);
															
 
																 			free(coop_sends);
															
 
																-			starpu_data_release(req->data_handle);
															
 
																+			starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
															
 
																 		}
															
 
																 	}
															
 
																 	else
															
 
																 	{
															
 
																 		/* Trivial request */
															
 
																-		starpu_data_release(req->data_handle);
															
 
																+		starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
															
 
																 	}
															
 
																 }
															
--- a/mpi/src/starpu_mpi_fxt.h
+++ b/mpi/src/starpu_mpi_fxt.h
@@ -72,7 +72,7 @@ extern "C"
 
																 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, data_tag, size)	\
															
 
																 	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
															
 
																 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, data_tag, size, jobid, handle)	\
															
 
																-	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), _starpu_gettid(), (handle));
															
 
																+	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), (handle), _starpu_gettid());
															
 
																 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, data_tag)	\
															
 
																 	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (data_tag), _starpu_gettid());
															
 
																 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, data_tag)	\
															
--- a/mpi/src/starpu_mpi_private.c
+++ b/mpi/src/starpu_mpi_private.c
@@ -22,6 +22,7 @@ int _starpu_debug_level_max=0;
 
																 int _starpu_mpi_tag = 42;
															
 
																 int _starpu_mpi_comm_debug;
															
 
																+int _starpu_mpi_nobind = -1;
															
 
																 int _starpu_mpi_thread_cpuid = -1;
															
 
																 int _starpu_mpi_use_prio = 1;
															
 
																 int _starpu_mpi_fake_world_size = -1;
															
@@ -62,6 +63,7 @@ void _starpu_mpi_env_init(void)
 
																         _starpu_mpi_comm_debug = starpu_getenv("STARPU_MPI_COMM") != NULL;
															
 
																 	_starpu_mpi_fake_world_size = starpu_get_env_number("STARPU_MPI_FAKE_SIZE");
															
 
																 	_starpu_mpi_fake_world_rank = starpu_get_env_number("STARPU_MPI_FAKE_RANK");
															
 
																+	_starpu_mpi_nobind = starpu_get_env_number_default("STARPU_MPI_NOBIND", 0);
															
 
																 	_starpu_mpi_thread_cpuid = starpu_get_env_number_default("STARPU_MPI_THREAD_CPUID", -1);
															
 
																 	_starpu_mpi_use_prio = starpu_get_env_number_default("STARPU_MPI_PRIORITIES", 1);
															
 
																 	_starpu_mpi_use_coop_sends = starpu_get_env_number_default("STARPU_MPI_COOP_SENDS", 1);
															
--- a/mpi/src/starpu_mpi_private.h
+++ b/mpi/src/starpu_mpi_private.h
@@ -61,6 +61,7 @@ void _starpu_mpi_set_debug_level_max(int level);
 
																 extern int _starpu_mpi_fake_world_size;
															
 
																 extern int _starpu_mpi_fake_world_rank;
															
 
																 extern int _starpu_mpi_use_prio;
															
 
																+extern int _starpu_mpi_nobind;
															
 
																 extern int _starpu_mpi_thread_cpuid;
															
 
																 extern int _starpu_mpi_use_coop_sends;
															
 
																 extern int _starpu_mpi_mem_throttle;
															
@@ -200,7 +201,7 @@ struct _starpu_mpi_data
 
																 {
															
 
																 	int magic;
															
 
																 	struct _starpu_mpi_node_tag node_tag;
															
 
																-	int *cache_sent;
															
 
																+	char *cache_sent;
															
 
																 	int cache_received;
															
 
																 	/** Rendez-vous data for opportunistic cooperative sends */
															
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -45,10 +45,10 @@ endif
 
																 endif
															
 
																 if STARPU_HAVE_AM111
															
 
																-TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
															
 
																+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
															
 
																 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
															
 
																 else
															
 
																-TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
															
 
																+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
															
 
																 endif
															
 
																 if STARPU_MPI_CHECK
															
--- a/mpi/tests/broadcast.c
+++ b/mpi/tests/broadcast.c
@@ -40,7 +40,7 @@ int main(int argc, char **argv)
 
																 {
															
 
																 	int ret, rank, size;
															
 
																 	starpu_data_handle_t handle;
															
 
																-	int var;
															
 
																+	int var=-1;
															
 
																 	int mpi_init;
															
 
																 	MPI_Status status;
															
--- a/mpi/tests/early_request.c
+++ b/mpi/tests/early_request.c
@@ -109,6 +109,18 @@ void submitted_order_fun(void *buffers[], void *cl_arg)
 
																 	(void)cl_arg;
															
 
																 }
															
 
																+static struct starpu_codelet submitted_order_rw =
															
 
																+{
															
 
																+	.where = STARPU_CPU,
															
 
																+	.cpu_funcs = {submitted_order_fun, NULL},
															
 
																+	.nbuffers = 2,
															
 
																+	.modes = {STARPU_RW, STARPU_RW},
															
 
																+#ifdef STARPU_SIMGRID
															
 
																+	.model = &starpu_perfmodel_nop,
															
 
																+#endif
															
 
																+	.name = "submitted_order_enforcer"
															
 
																+};
															
 
																+
															
 
																 static struct starpu_codelet submitted_order =
															
 
																 {
															
 
																 	.where = STARPU_CPU,
															
@@ -156,11 +168,15 @@ void insert_work_for_one_element(struct element *el)
 
																 			   STARPU_W,tmp_send,
															
 
																 			   0);
															
 
																 	//Send operation
															
 
																-	starpu_insert_task(&submitted_order,
															
 
																+	starpu_insert_task(&submitted_order_rw,
															
 
																 			   STARPU_RW,el->ensure_submitted_order_send,
															
 
																-			   STARPU_W,tmp_send,
															
 
																+			   STARPU_RW,tmp_send,
															
 
																 			   0);
															
 
																 	starpu_mpi_isend_detached(tmp_send,el->foreign_domain,el->tag, MPI_COMM_WORLD, NULL, NULL);
															
 
																+	starpu_insert_task(&submitted_order_rw,
															
 
																+			   STARPU_RW,el->ensure_submitted_order_send,
															
 
																+			   STARPU_RW,tmp_send,
															
 
																+			   0);
															
 
																 	//Recv operation for current element
															
 
																 	starpu_insert_task(&submitted_order,
															
--- a/mpi/tests/insert_task_compute.c
+++ b/mpi/tests/insert_task_compute.c
@@ -47,7 +47,7 @@ int test(int rank, int node, int *before, int *after, int task_insert, int data_
 
																 	ret = starpu_mpi_init_conf(NULL, NULL, 0, MPI_COMM_WORLD, NULL);
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
															
 
																-	if (starpu_cpu_worker_get_count() <= 0)
															
 
																+	if (starpu_cpu_worker_get_count() == 0)
															
 
																 	{
															
 
																 		// If there is no cpu to execute the codelet, mpi will block trying to do the post-execution communication
															
 
																 		ret = -ENODEV;
															
--- a/mpi/tests/pingpong.c
+++ b/mpi/tests/pingpong.c
@@ -43,13 +43,14 @@ int main(int argc, char **argv)
 
																 {
															
 
																 	int ret, rank, size;
															
 
																 	int mpi_init;
															
 
																+	int i;
															
 
																 	int niter = DEFAULT_NITER;
															
 
																 	int data_size = DEFAULT_DATA_SIZE;
															
 
																 	int sleep_time = DEFAULT_SLEEP_TIME;
															
 
																 	int method = DEFAULT_METHOD;
															
 
																-	for (int i = 1; i < argc; i++)
															
 
																+	for (i = 1; i < argc; i++)
															
 
																 	{
															
 
																 		if (strcmp(argv[i], "-n") == 0)
															
 
																 		{
															
@@ -134,6 +135,7 @@ int main(int argc, char **argv)
 
																 	int loop;
															
 
																 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
															
 
																 	int sender;
															
 
																+	int r = 0;
															
 
																 	if (method == 0) // ping pongs
															
 
																 	{
															
@@ -161,7 +163,7 @@ int main(int argc, char **argv)
 
																 			sender = loop % size;
															
 
																 			if (sender == rank)
															
 
																 			{
															
 
																-				for (int r = 0; r < size; r++)
															
 
																+				for (r = 0; r < size; r++)
															
 
																 				{
															
 
																 					if (r != rank)
															
 
																 					{
															
@@ -175,7 +177,7 @@ int main(int argc, char **argv)
 
																 				MPI_Status status;
															
 
																 				starpu_mpi_recv(tab_handle, sender, (rank * niter) + loop, MPI_COMM_WORLD, &status);
															
 
																-				for (int r = 0; r < (size-1); r++)
															
 
																+				for (r = 0; r < (size-1); r++)
															
 
																 					starpu_sleep(sleep_time / 1000);
															
 
																 			}
															
 
																 		}
															
--- a/src/common/fxt.c
+++ b/src/common/fxt.c
@@ -27,6 +27,7 @@ unsigned long _starpu_job_cnt = 0;
 
																 #ifdef STARPU_USE_FXT
															
 
																 #include <common/fxt.h>
															
 
																 #include <starpu_fxt.h>
															
 
																+#include <sys/stat.h>
															
 
																 #ifdef STARPU_HAVE_WINDOWS
															
 
																 #include <windows.h>
															
@@ -95,6 +96,16 @@ static void _starpu_profile_set_tracefile(void)
 
																 	char *fxt_prefix = starpu_getenv("STARPU_FXT_PREFIX");
															
 
																 	if (!fxt_prefix)
															
 
																 	     fxt_prefix = "/tmp/";
															
 
																+	else
															
 
																+	{
															
 
																+		// Check if the given folder really exists:
															
 
																+		struct stat folder_stat;
															
 
																+		if (stat(fxt_prefix, &folder_stat) < 0 || !S_ISDIR(folder_stat.st_mode))
															
 
																+		{
															
 
																+			_STARPU_MSG("%s is not a valid directory.\n", fxt_prefix);
															
 
																+			_starpu_abort();
															
 
																+		}
															
 
																+	}
															
 
																 	user = starpu_getenv("USER");
															
 
																 	if (!user)
															
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -85,8 +85,10 @@
 
																 #define _STARPU_UYIELD() ((void)0)
															
 
																 #endif
															
 
																 #if defined(STARPU_HAVE_SCHED_YIELD) && defined(STARPU_HAVE_HELGRIND_H)
															
 
																+#define STARPU_VALGRIND_YIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); } while (0)
															
 
																 #define STARPU_UYIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); else _STARPU_UYIELD(); } while (0)
															
 
																 #else
															
 
																+#define STARPU_VALGRIND_YIELD() do { } while (0)
															
 
																 #define STARPU_UYIELD() _STARPU_UYIELD()
															
 
																 #endif
															
--- a/src/core/dependencies/cg.c
+++ b/src/core/dependencies/cg.c
@@ -169,7 +169,6 @@ int _starpu_list_tag_successors_in_cg_list(struct _starpu_cg_list *successors, u
 
																 	return n;
															
 
																 }
															
 
																-/* Note: in case of a tag, it must be already locked */
															
 
																 void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg)
															
 
																 {
															
 
																 	STARPU_ASSERT(cg);
															
@@ -208,6 +207,7 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 
																 				struct _starpu_tag *tag;
															
 
																 				tag = cg->succ.tag;
															
 
																+				_starpu_spin_lock(&tag->lock);
															
 
																 				tag_successors = &tag->tag_successors;
															
 
																 				tag_successors->ndeps_completed++;
															
@@ -219,8 +219,10 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 
																 				{
															
 
																 					/* reset the counter so that we can reuse the completion group */
															
 
																 					tag_successors->ndeps_completed = 0;
															
 
																+					/* This releases the lock */
															
 
																 					_starpu_tag_set_ready(tag);
															
 
																-				}
															
 
																+				} else
															
 
																+					_starpu_spin_unlock(&tag->lock);
															
 
																 				break;
															
 
																 			}
															
@@ -370,21 +372,7 @@ void _starpu_notify_cg_list(void *pred, struct _starpu_cg_list *successors)
 
																 			successors->nsuccs--;
															
 
																 		}
															
 
																 		_starpu_spin_unlock(&successors->lock);
															
 
																-
															
 
																-		struct _starpu_tag *cgtag = NULL;
															
 
																-
															
 
																-		if (cg_type == STARPU_CG_TAG)
															
 
																-		{
															
 
																-			cgtag = cg->succ.tag;
															
 
																-			STARPU_ASSERT(cgtag);
															
 
																-			_starpu_spin_lock(&cgtag->lock);
															
 
																-		}
															
 
																-
															
 
																 		_starpu_notify_cg(pred, cg);
															
 
																-
															
 
																-		if (cg_type == STARPU_CG_TAG)
															
 
																-			_starpu_spin_unlock(&cgtag->lock);
															
 
																-
															
 
																 		_starpu_spin_lock(&successors->lock);
															
 
																 	}
															
 
																 	successors->terminated = 1;
															
--- a/src/core/dependencies/data_arbiter_concurrency.c
+++ b/src/core/dependencies/data_arbiter_concurrency.c
@@ -533,7 +533,7 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 
																 }
															
 
																 void ___starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
															
 
																 #else // LOCK_OR_DELEGATE
															
 
																-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
															
 
																+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
															
 
																 #endif
															
 
																 {
															
 
																 	starpu_arbiter_t arbiter = handle->arbiter;
															
@@ -546,10 +546,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 
																 	{
															
 
																 		/* No waiter, just remove our reference */
															
 
																 		_starpu_spin_lock(&handle->header_lock);
															
 
																-		STARPU_ASSERT(handle->refcnt > 0);
															
 
																-		handle->refcnt--;
															
 
																-		STARPU_ASSERT(handle->busy_count > 0);
															
 
																-		handle->busy_count--;
															
 
																+		if (down_to_mode == STARPU_NONE)
															
 
																+		{
															
 
																+			STARPU_ASSERT(handle->refcnt > 0);
															
 
																+			handle->refcnt--;
															
 
																+			STARPU_ASSERT(handle->busy_count > 0);
															
 
																+			handle->busy_count--;
															
 
																+		}
															
 
																+		else
															
 
																+		{
															
 
																+			/* Downgrade from W or RW down to R, keeping the same reference,
															
 
																+			 * but thus allowing other readers without allowing writers.  */
															
 
																+			STARPU_ASSERT(down_to_mode == STARPU_R &&
															
 
																+				      handle->current_mode == STARPU_W);
															
 
																+			handle->current_mode = down_to_mode;
															
 
																+		}
															
 
																 #ifndef LOCK_OR_DELEGATE
															
 
																 		STARPU_PTHREAD_MUTEX_UNLOCK(&arbiter->mutex);
															
 
																 #endif
															
@@ -562,10 +573,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 
																 	/* There is a waiter, remove our reference */
															
 
																 	_starpu_spin_lock(&handle->header_lock);
															
 
																-	STARPU_ASSERT(handle->refcnt > 0);
															
 
																-	handle->refcnt--;
															
 
																-	STARPU_ASSERT(handle->busy_count > 0);
															
 
																-	handle->busy_count--;
															
 
																+	if (down_to_mode == STARPU_NONE)
															
 
																+	{
															
 
																+		STARPU_ASSERT(handle->refcnt > 0);
															
 
																+		handle->refcnt--;
															
 
																+		STARPU_ASSERT(handle->busy_count > 0);
															
 
																+		handle->busy_count--;
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		/* Downgrade from W or RW down to R, keeping the same reference,
															
 
																+		 * but thus allowing other readers without allowing writers.  */
															
 
																+		STARPU_ASSERT(down_to_mode == STARPU_R &&
															
 
																+			      handle->current_mode == STARPU_W);
															
 
																+		handle->current_mode = down_to_mode;
															
 
																+	}
															
 
																 	/* There should be at least one busy_count reference for the waiter
															
 
																 	 * (thus we don't risk to see the handle disappear below) */
															
 
																 	STARPU_ASSERT(handle->busy_count > 0);
															
--- a/src/core/dependencies/data_concurrency.c
+++ b/src/core/dependencies/data_concurrency.c
@@ -509,10 +509,16 @@ void _starpu_submit_job_take_data_deps(struct _starpu_job *j)
 
																  * This may free the handle if it was lazily unregistered (1 is returned in
															
 
																  * that case). The handle pointer thus becomes invalid for the caller.
															
 
																  */
															
 
																-int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
															
 
																+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
															
 
																 {
															
 
																 	_starpu_spin_checklocked(&handle->header_lock);
															
 
																+	if (down_to_mode != STARPU_NONE && handle->current_mode == down_to_mode)
															
 
																+	{
															
 
																+		/* No change, nothing to do */
															
 
																+		return 0;
															
 
																+	}
															
 
																+
															
 
																 	if (handle->arbiter)
															
 
																 	{
															
 
																 		/* Keep our reference for now, _starpu_notify_arbitered_dependencies
															
@@ -521,22 +527,34 @@ int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 
																 		STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->reduction_req_list));
															
 
																 		_starpu_spin_unlock(&handle->header_lock);
															
 
																 		/* _starpu_notify_arbitered_dependencies will handle its own locking */
															
 
																-		_starpu_notify_arbitered_dependencies(handle);
															
 
																+		_starpu_notify_arbitered_dependencies(handle, down_to_mode);
															
 
																 		/* We have already unlocked */
															
 
																 		return 1;
															
 
																 	}
															
 
																-	/* A data access has finished so we remove a reference. */
															
 
																-	STARPU_ASSERT(handle->refcnt > 0);
															
 
																-	handle->refcnt--;
															
 
																-	STARPU_ASSERT(handle->busy_count > 0);
															
 
																-	handle->busy_count--;
															
 
																-	if (_starpu_data_check_not_busy(handle))
															
 
																-		/* Handle was destroyed, nothing left to do.  */
															
 
																-		return 1;
															
 
																-
															
 
																 	STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->arbitered_req_list));
															
 
																+	if (down_to_mode == STARPU_NONE)
															
 
																+	{
															
 
																+		/* A data access has finished so we remove a reference. */
															
 
																+		STARPU_ASSERT(handle->refcnt > 0);
															
 
																+		handle->refcnt--;
															
 
																+		STARPU_ASSERT(handle->busy_count > 0);
															
 
																+		handle->busy_count--;
															
 
																+		if (_starpu_data_check_not_busy(handle))
															
 
																+			/* Handle was destroyed, nothing left to do.  */
															
 
																+			return 1;
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		/* Downgrade from W or RW down to R, keeping the same reference,
															
 
																+		 * but thus allowing other readers without allowing writers.  */
															
 
																+		STARPU_ASSERT(down_to_mode == STARPU_R &&
															
 
																+				(handle->current_mode == STARPU_RW ||
															
 
																+				 handle->current_mode == STARPU_W));
															
 
																+		handle->current_mode = down_to_mode;
															
 
																+	}
															
 
																+
															
 
																 	/* In case there is a pending reduction, and that this is the last
															
 
																 	 * requester, we may go back to a "normal" coherency model. */
															
 
																 	if (handle->reduction_refcnt > 0)
															
--- a/src/core/dependencies/data_concurrency.h
+++ b/src/core/dependencies/data_concurrency.h
@@ -28,8 +28,8 @@ void _starpu_submit_job_enforce_arbitered_deps(struct _starpu_job *j, unsigned b
 
																 void _starpu_submit_job_take_data_deps(struct _starpu_job *j);
															
 
																 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
															
 
																-int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
															
 
																-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle);
															
 
																+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
															
 
																+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
															
 
																 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
															
 
																 							  enum starpu_data_access_mode mode,
															
--- a/src/core/dependencies/implicit_data_deps.c
+++ b/src/core/dependencies/implicit_data_deps.c
@@ -225,10 +225,13 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
																 		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
															
 
																 		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
															
 
																+		if (mode & STARPU_R)
															
 
																+			STARPU_ASSERT_MSG(handle->initialized || handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
															
 
																+
															
 
																 		if (mode & STARPU_W || mode == STARPU_REDUX)
															
 
																 		{
															
 
																-			STARPU_ASSERT_MSG(!handle->readonly, "Read-only handles can not be written to");
															
 
																+			STARPU_ASSERT_MSG(!handle->readonly, "Read-only handle %p can not be written to", handle);
															
 
																 			handle->initialized = 1;
															
 
																 			/* We will change our value, disconnect from our readonly duplicates */
															
@@ -613,18 +616,25 @@ void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data
 
																         _STARPU_LOG_OUT();
															
 
																 }
															
 
																-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
															
 
																+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
															
 
																 {
															
 
																 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
															
 
																 	unsigned do_submit_tasks = 0;
															
 
																+	unsigned last_cnt;
															
 
																 	/* Here helgrind would shout that this is an unprotected access, but
															
 
																 	 * count can only be zero if we don't have to care about
															
 
																 	 * post_sync_tasks_cnt at all.  */
															
 
																-	if (STARPU_RUNNING_ON_VALGRIND || handle->post_sync_tasks_cnt)
															
 
																+	if (handle->post_sync_tasks_cnt)
															
 
																 	{
															
 
																 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
															
 
																-		if (--handle->post_sync_tasks_cnt == 0)
															
 
																+		last_cnt = handle->post_sync_tasks_cnt;
															
 
																+
															
 
																+		if (mode == STARPU_NONE)
															
 
																+			/* Last release from us */
															
 
																+			handle->post_sync_tasks_cnt--;
															
 
																+
															
 
																+		if (last_cnt == 1)
															
 
																 		{
															
 
																 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
															
 
																 			do_submit_tasks = 1;
															
--- a/src/core/dependencies/implicit_data_deps.h
+++ b/src/core/dependencies/implicit_data_deps.h
@@ -30,7 +30,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 
																 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
															
 
																 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
															
 
																-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);
															
 
																+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
															
 
																 /** Register a hook to be called when a write is submitted */
															
 
																 void _starpu_implicit_data_deps_write_hook(void (*func)(starpu_data_handle_t));
															
--- a/src/core/dependencies/tags.c
+++ b/src/core/dependencies/tags.c
@@ -120,13 +120,23 @@ static void _starpu_tag_free(void *_tag)
 
																 			unsigned STARPU_ATTRIBUTE_UNUSED remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);
															
 
																 			if (!ntags && (cg->cg_type == STARPU_CG_TAG))
															
 
																+			{
															
 
																 				/* Last tag this cg depends on, cg becomes unreferenced */
															
 
																+#ifdef STARPU_DEBUG
															
 
																+				free(cg->deps);
															
 
																+				free(cg->done);
															
 
																+#endif
															
 
																 				free(cg);
															
 
																+			}
															
 
																 		}
															
 
																 #ifdef STARPU_DYNAMIC_DEPS_SIZE
															
 
																 		free(tag->tag_successors.succ);
															
 
																 #endif
															
 
																+#ifdef STARPU_DEBUG
															
 
																+		free(tag->tag_successors.deps);
															
 
																+		free(tag->tag_successors.done);
															
 
																+#endif
															
 
																 		_starpu_spin_unlock(&tag->lock);
															
 
																 		_starpu_spin_destroy(&tag->lock);
															
@@ -221,7 +231,7 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 
																 	return tag;
															
 
																 }
															
 
																-/* lock should be taken */
															
 
																+/* lock should be taken, and this releases it */
															
 
																 void _starpu_tag_set_ready(struct _starpu_tag *tag)
															
 
																 {
															
 
																 	/* mark this tag as ready to run */
															
@@ -229,6 +239,10 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 
																 	/* declare it to the scheduler ! */
															
 
																 	struct _starpu_job *j = tag->job;
															
 
																+	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
															
 
																+	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
															
 
																+	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
															
 
																+
															
 
																 	/* In case the task job is going to be scheduled immediately, and if
															
 
																 	 * the task is "empty", calling _starpu_push_task would directly try to enforce
															
 
																 	 * the dependencies of the task, and therefore it would try to grab the
															
@@ -238,14 +252,9 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 
																 	/* enforce data dependencies */
															
 
																 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
															
 
																 	_starpu_enforce_deps_starting_from_task(j);
															
 
																-
															
 
																-	_starpu_spin_lock(&tag->lock);
															
 
																-	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
															
 
																-	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
															
 
																-	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
															
 
																 }
															
 
																-/* the lock must be taken ! */
															
 
																+/* the lock of the tag must already be taken ! */
															
 
																 static void _starpu_tag_add_succ(struct _starpu_tag *tag, struct _starpu_cg *cg)
															
 
																 {
															
 
																 	STARPU_ASSERT(tag);
															
@@ -396,12 +405,10 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 
																 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
															
 
																 		STARPU_ASSERT(tag_dep != tag_child);
															
 
																 		_starpu_spin_lock(&tag_dep->lock);
															
 
																-		_starpu_spin_lock(&tag_child->lock);
															
 
																 		_starpu_tag_add_succ(tag_dep, cg);
															
 
																 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
															
 
																 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
															
 
																 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
															
 
																-		_starpu_spin_unlock(&tag_child->lock);
															
 
																 		_starpu_spin_unlock(&tag_dep->lock);
															
 
																 	}
															
 
																 }
															
@@ -434,12 +441,10 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 
																 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
															
 
																 		STARPU_ASSERT(tag_dep != tag_child);
															
 
																 		_starpu_spin_lock(&tag_dep->lock);
															
 
																-		_starpu_spin_lock(&tag_child->lock);
															
 
																 		_starpu_tag_add_succ(tag_dep, cg);
															
 
																 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
															
 
																 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
															
 
																 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
															
 
																-		_starpu_spin_unlock(&tag_child->lock);
															
 
																 		_starpu_spin_unlock(&tag_dep->lock);
															
 
																 	}
															
 
																 	va_end(pa);
															
--- a/src/core/dependencies/tags.h
+++ b/src/core/dependencies/tags.h
@@ -68,6 +68,8 @@ void _starpu_notify_tag_dependencies(struct _starpu_tag *tag);
 
																 void _starpu_notify_job_start_tag_dependencies(struct _starpu_tag *tag, _starpu_notify_job_start_data *data);
															
 
																 void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job);
															
 
																+
															
 
																+/* lock should be taken, and this releases it */
															
 
																 void _starpu_tag_set_ready(struct _starpu_tag *tag);
															
 
																 unsigned _starpu_submit_job_enforce_task_deps(struct _starpu_job *j);
															
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -82,7 +82,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
																 	job->task = task;
															
 
																-#ifndef STARPU_USE_FXT
															
 
																+#if !defined(STARPU_USE_FXT) && !defined(STARPU_DEBUG)
															
 
																 	if (_starpu_bound_recording || _starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1 || STARPU_AYU_EVENT)
															
 
																 #endif
															
 
																 	{
															
--- a/src/core/perfmodel/multiple_regression.c
+++ b/src/core/perfmodel/multiple_regression.c
@@ -236,6 +236,9 @@ int dgels_multiple_reg_coeff(double *mpar, double *my, unsigned long nn, unsigne
 
																 	if( info != 0 )
															
 
																 	{
															
 
																 		_STARPU_DISP("Warning: Problems when executing dgels_ function. It seems like the diagonal element %ld is zero.\n Multiple linear regression model will not be written into perfmodel file.\n", info);
															
 
																+		free(X);
															
 
																+		free(Y);
															
 
																+		free(work);
															
 
																 		return 1;
															
 
																 	}
															
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -862,7 +862,7 @@ uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 
																 /* in case the data was accessed on a write mode, do not forget to
															
 
																  * make it accessible again once it is possible ! */
															
 
																-void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, struct _starpu_data_replicate *replicate)
															
 
																+void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
															
 
																 {
															
 
																 	uint32_t wt_mask;
															
 
																 	wt_mask = default_wt_mask | handle->wt_mask;
															
@@ -887,14 +887,17 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 
																 	if (cpt == STARPU_SPIN_MAXTRY)
															
 
																 		_starpu_spin_lock(&handle->header_lock);
															
 
																-	/* Release refcnt taken by fetch_data_on_node */
															
 
																-	replicate->refcnt--;
															
 
																-	STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
															
 
																+	if (down_to_mode == STARPU_NONE)
															
 
																+	{
															
 
																+		/* Release refcnt taken by fetch_data_on_node */
															
 
																+		replicate->refcnt--;
															
 
																+		STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
															
 
																-	STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
															
 
																-	handle->busy_count--;
															
 
																+		STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
															
 
																+		handle->busy_count--;
															
 
																+	}
															
 
																-	if (!_starpu_notify_data_dependencies(handle))
															
 
																+	if (!_starpu_notify_data_dependencies(handle, down_to_mode))
															
 
																 		_starpu_spin_unlock(&handle->header_lock);
															
 
																 }
															
@@ -1229,7 +1232,7 @@ enomem:
 
																 		local_replicate = get_replicate(handle, mode, workerid, node);
															
 
																-		_starpu_release_data_on_node(handle, 0, local_replicate);
															
 
																+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
															
 
																 	}
															
 
																 	return -1;
															
@@ -1338,13 +1341,13 @@ void __starpu_push_task_output(struct _starpu_job *j)
 
																 		if (node == -1)
															
 
																 		{
															
 
																 			/* NOWHERE case, just notify dependencies */
															
 
																-			if (!_starpu_notify_data_dependencies(handle))
															
 
																+			if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
															
 
																 				_starpu_spin_unlock(&handle->header_lock);
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
 
																 			_starpu_spin_unlock(&handle->header_lock);
															
 
																-			_starpu_release_data_on_node(handle, 0, local_replicate);
															
 
																+			_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
															
 
																 		}
															
 
																 	}
															
--- a/src/datawizard/coherency.h
+++ b/src/datawizard/coherency.h
@@ -316,6 +316,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
																 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
															
 
																 /** This releases a reference on the handle */
															
 
																 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
															
 
																+				  enum starpu_data_access_mode down_to_mode,
															
 
																 				  struct _starpu_data_replicate *replicate);
															
 
																 void _starpu_update_data_state(starpu_data_handle_t handle,
															
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -375,8 +375,8 @@ int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
 
																 	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) numblocks_1 * ld1_src, (unsigned long) ld2_src);
															
 
																 	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) numblocks_1 * ld1_dst, (unsigned long) ld2_dst);
															
 
																-	if (ld1_src * ld2_src == blocksize * numblocks_1 &&
															
 
																-	    ld1_dst * ld2_dst == blocksize * numblocks_1)
															
 
																+	if (ld2_src == blocksize * numblocks_1 &&
															
 
																+	    ld2_dst == blocksize * numblocks_1)
															
 
																 		/* Optimize contiguous case */
															
 
																 		return starpu_interface_copy(src, src_offset, src_node,
															
 
																 					     dst, dst_offset, dst_node,
															
@@ -425,8 +425,8 @@ int starpu_interface_copy4d(uintptr_t src, size_t src_offset, unsigned src_node,
 
																 	STARPU_ASSERT_MSG(ld3_src >= numblocks_2 * ld2_src, "block group group size %lu is bigger than group group ld %lu in source", (unsigned long) numblocks_2 * ld2_src, (unsigned long) ld3_src);
															
 
																 	STARPU_ASSERT_MSG(ld3_dst >= numblocks_2 * ld2_dst, "block group group size %lu is bigger than group group ld %lu in destination", (unsigned long) numblocks_2 * ld2_dst, (unsigned long) ld3_dst);
															
 
																-	if (ld1_src * ld2_src * ld3_src == blocksize * numblocks_1 * numblocks_2 &&
															
 
																-	    ld1_dst * ld2_dst * ld3_dst == blocksize * numblocks_1 * numblocks_2)
															
 
																+	if (ld3_src == blocksize * numblocks_1 * numblocks_2 &&
															
 
																+	    ld3_dst == blocksize * numblocks_1 * numblocks_2)
															
 
																 		/* Optimize contiguous case */
															
 
																 		return starpu_interface_copy(src, src_offset, src_node,
															
 
																 					     dst, dst_offset, dst_node,
															
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -749,12 +749,12 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 
																 	{
															
 
																 		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
															
 
																 		STARPU_ASSERT(!ret);
															
 
																-		_starpu_release_data_on_node(handle, handle->home_node, replicate);
															
 
																+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
															
 
																 	}
															
 
																 	else
															
 
																 	{
															
 
																 		_starpu_spin_lock(&handle->header_lock);
															
 
																-		if (!_starpu_notify_data_dependencies(handle))
															
 
																+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
															
 
																 			_starpu_spin_unlock(&handle->header_lock);
															
 
																 	}
															
 
																 }
															
@@ -836,6 +836,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
																 	int sequential_consistency = handle->sequential_consistency;
															
 
																 	if (sequential_consistency && !nowait)
															
 
																 	{
															
 
																+		/* We will acquire it in write mode to catch all dependencies,
															
 
																+		 * but possibly it's not actually initialized. Fake it to avoid
															
 
																+		 * getting caught doing it */
															
 
																+		handle->initialized = 1;
															
 
																 		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_data_unregister must not be called from a task or callback, perhaps you can use starpu_data_unregister_submit instead");
															
 
																 		/* If sequential consistency is enabled, wait until data is available */
															
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -485,16 +485,26 @@ int starpu_data_acquire_try(starpu_data_handle_t handle, enum starpu_data_access
 
																 /* This function must be called after starpu_data_acquire so that the
															
 
																  * application release the data */
															
 
																-void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
															
 
																+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int node)
															
 
																 {
															
 
																 	STARPU_ASSERT(handle);
															
 
																+	if (mode == STARPU_RW)
															
 
																+		/* They are equivalent here, and current_mode is never STARPU_RW */
															
 
																+		mode = STARPU_W;
															
 
																+
															
 
																+	STARPU_ASSERT_MSG(mode == STARPU_NONE ||
															
 
																+			  mode == handle->current_mode ||
															
 
																+			  (mode == STARPU_R &&
															
 
																+			     handle->current_mode == STARPU_W),
															
 
																+		"We only support releasing from W to R");
															
 
																+
															
 
																 	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
															
 
																-	_starpu_unlock_post_sync_tasks(handle);
															
 
																+	_starpu_unlock_post_sync_tasks(handle, mode);
															
 
																 	/* The application can now release the rw-lock */
															
 
																 	if (node >= 0)
															
 
																-		_starpu_release_data_on_node(handle, 0, &handle->per_node[node]);
															
 
																+		_starpu_release_data_on_node(handle, 0, mode, &handle->per_node[node]);
															
 
																 	else
															
 
																 	{
															
 
																 		_starpu_spin_lock(&handle->header_lock);
															
@@ -505,17 +515,27 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 
																 				handle->per_node[i].refcnt--;
															
 
																 		}
															
 
																 		handle->busy_count--;
															
 
																-		if (!_starpu_notify_data_dependencies(handle))
															
 
																+		if (!_starpu_notify_data_dependencies(handle, mode))
															
 
																 			_starpu_spin_unlock(&handle->header_lock);
															
 
																 	}
															
 
																 }
															
 
																-void starpu_data_release(starpu_data_handle_t handle)
															
 
																+void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
															
 
																+{
															
 
																+	starpu_data_release_to_on_node(handle, STARPU_NONE, node);
															
 
																+}
															
 
																+
															
 
																+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
															
 
																 {
															
 
																 	int home_node = handle->home_node;
															
 
																 	if (home_node < 0)
															
 
																 		home_node = STARPU_MAIN_RAM;
															
 
																-	starpu_data_release_on_node(handle, home_node);
															
 
																+	starpu_data_release_to_on_node(handle, mode, home_node);
															
 
																+}
															
 
																+
															
 
																+void starpu_data_release(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	starpu_data_release_to(handle, STARPU_NONE);
															
 
																 }
															
 
																 static void _prefetch_data_on_node(void *arg)
															
@@ -531,7 +551,7 @@ static void _prefetch_data_on_node(void *arg)
 
																 		_starpu_data_acquire_wrapper_finished(wrapper);
															
 
																 	_starpu_spin_lock(&handle->header_lock);
															
 
																-	if (!_starpu_notify_data_dependencies(handle))
															
 
																+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
															
 
																 		_starpu_spin_unlock(&handle->header_lock);
															
 
																 }
															
@@ -581,7 +601,7 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
																 		/* In case there was a temporary handle (eg. used for reduction), this
															
 
																 		 * handle may have requested to be destroyed when the data is released
															
 
																 		 * */
															
 
																-		if (!_starpu_notify_data_dependencies(handle))
															
 
																+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
															
 
																 			_starpu_spin_unlock(&handle->header_lock);
															
 
																 	}
															
 
																 	else if (!async)
															
@@ -666,6 +686,9 @@ static void _starpu_data_wont_use(void *data)
 
																 void starpu_data_wont_use(starpu_data_handle_t handle)
															
 
																 {
															
 
																+	if (!handle->initialized)
															
 
																+		/* No value atm actually */
															
 
																+		return;
															
 
																 	_STARPU_TRACE_DATA_WONT_USE(handle);
															
 
																 	starpu_data_acquire_on_node_cb_sequential_consistency_quick(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_R, _starpu_data_wont_use, handle, 1, 1);
															
 
																 }
															
--- a/src/datawizard/write_back.c
+++ b/src/datawizard/write_back.c
@@ -24,7 +24,7 @@ static void wt_callback(void *arg)
 
																 	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
															
 
																 	_starpu_spin_lock(&handle->header_lock);
															
 
																-	if (!_starpu_notify_data_dependencies(handle))
															
 
																+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
															
 
																 		_starpu_spin_unlock(&handle->header_lock);
															
 
																 }
															
--- a/src/debug/latency.c
+++ b/src/debug/latency.c
@@ -36,7 +36,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 
																 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
															
 
																 		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
															
 
																 		STARPU_ASSERT(!ret);
															
 
																-		_starpu_release_data_on_node(handle, node0, replicate_0);
															
 
																+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
															
 
																 		_starpu_spin_lock(&handle->header_lock);
															
 
																 		handle->refcnt++;
															
@@ -46,6 +46,6 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 
																 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
															
 
																 		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
															
 
																 		STARPU_ASSERT(!ret);
															
 
																-		_starpu_release_data_on_node(handle, node1, replicate_1);
															
 
																+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
															
 
																 	}
															
 
																 }
															
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -333,7 +333,7 @@ static struct data_info *get_data(unsigned long handle, int mpi_rank)
 
																 	return data;
															
 
																 }
															
 
																-static void handle_papi_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
															
 
																+static void handle_papi_event(struct fxt_ev_64 *ev STARPU_ATTRIBUTE_UNUSED, struct starpu_fxt_options *options STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																 #ifdef STARPU_PAPI
															
 
																 	int event_code = ev->param[0];
															
@@ -2307,9 +2307,11 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
																 			snprintf(paje_key, sizeof(paje_key), "com_%u", comid);
															
 
																 			program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
															
 
																 			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, src);
															
 
																-			poti_StartLink(time, program_container, link_type, src_memnode_container, paje_value, paje_key);
															
 
																+			char str_handle[STARPU_POTI_STR_LEN];
															
 
																+			snprintf(str_handle, sizeof(str_handle), "%lx", handle);
															
 
																+			poti_user_StartLink(_starpu_poti_CommLinkStart, time, program_container, link_type, src_memnode_container, paje_value, paje_key, 1, str_handle);
															
 
																 #else
															
 
																-			fprintf(out_paje_file, "18	%.9f	%s	%sp	%u	%smm%u	com_%u\n", time, link_type, prefix, size, prefix, src, comid);
															
 
																+			fprintf(out_paje_file, "24	%.9f	%s	%sp	%u	%smm%u	com_%u	%lx\n", time, link_type, prefix, size, prefix, src, comid, handle);
															
 
																 #endif
															
 
																 		}
															
@@ -2636,10 +2638,12 @@ static void handle_job_push(struct fxt_ev_64 *ev, struct starpu_fxt_options *opt
 
																                char paje_value[STARPU_POTI_STR_LEN];
															
 
																                snprintf(paje_value, sizeof(paje_value), "%u", task);
															
 
																                snprintf(container, sizeof(container), "%sp", options->file_prefix);
															
 
																-               poti_NewEvent(get_event_time_stamp(ev, options), container, "pu", paje_value);
															
 
																+		if (!options->no_events)
															
 
																+			poti_NewEvent(get_event_time_stamp(ev, options), container, "pu", paje_value);
															
 
																 #else
															
 
																-	       fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
															
 
																-               fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "pu", options->file_prefix, task);
															
 
																+		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
															
 
																+		if (!options->no_events)
															
 
																+			fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "pu", options->file_prefix, task);
															
 
																 #endif
															
 
																 	}
															
@@ -2652,9 +2656,9 @@ static void handle_job_push(struct fxt_ev_64 *ev, struct starpu_fxt_options *opt
 
																 		fprintf(sched_tasks_file, "Time: %.9f\n", current_timestamp);
															
 
																 		fprintf(sched_tasks_file, "Priority: %d\n", priority);
															
 
																 		if (options->file_rank < 0)
															
 
																-			fprintf(sched_tasks_file, "JobId: %d\n", task);
															
 
																+			fprintf(sched_tasks_file, "JobId: %u\n", task);
															
 
																 		else
															
 
																-			fprintf(sched_tasks_file, "JobId: %d_%d\n", options->file_rank, task);
															
 
																+			fprintf(sched_tasks_file, "JobId: %d_%u\n", options->file_rank, task);
															
 
																 		fprintf(sched_tasks_file, "\n");
															
 
																 	}
															
 
																 }
															
@@ -2681,11 +2685,13 @@ static void handle_job_pop(struct fxt_ev_64 *ev, struct starpu_fxt_options *opti
 
																 		char paje_value[STARPU_POTI_STR_LEN];
															
 
																 		snprintf(paje_value, sizeof(paje_value), "%u", task);
															
 
																 		snprintf(container, sizeof(container), "%sp", options->file_prefix);
															
 
																-		poti_NewEvent(get_event_time_stamp(ev, options), container, "po", paje_value);
															
 
																+		if (!options->no_events)
															
 
																+			poti_NewEvent(get_event_time_stamp(ev, options), container, "po", paje_value);
															
 
																 #else
															
 
																 		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
															
 
																 		fprintf(out_paje_file, "13	%.9f	%ssched	nsubmitted	%f\n", current_timestamp, options->file_prefix, (float)nsubmitted);
															
 
																-		fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "po", options->file_prefix, task);
															
 
																+		if (!options->no_events)
															
 
																+			fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "po", options->file_prefix, task);
															
 
																 #endif
															
 
																 	}
															
@@ -2701,9 +2707,9 @@ static void handle_job_pop(struct fxt_ev_64 *ev, struct starpu_fxt_options *opti
 
																 		fprintf(sched_tasks_file, "Time: %.9f\n", current_timestamp);
															
 
																 		fprintf(sched_tasks_file, "Priority: %d\n", priority);
															
 
																 		if (options->file_rank < 0)
															
 
																-			fprintf(sched_tasks_file, "JobId: %d\n", task);
															
 
																+			fprintf(sched_tasks_file, "JobId: %u\n", task);
															
 
																 		else
															
 
																-			fprintf(sched_tasks_file, "JobId: %d_%d\n", options->file_rank, task);
															
 
																+			fprintf(sched_tasks_file, "JobId: %d_%u\n", options->file_rank, task);
															
 
																 		fprintf(sched_tasks_file, "\n");
															
 
																 	}
															
 
																 }
															
@@ -4450,7 +4456,7 @@ void _starpu_fxt_number_events_file_close(void)
 
																 		for (i = 0; i <= FUT_SETUP_CODE; i++)
															
 
																 		{
															
 
																 			if (number_events[i] > 0)
															
 
																-				fprintf(number_events_file, "0x%x\t%lu\n", i, number_events[i]);
															
 
																+				fprintf(number_events_file, "0x%x\t%"PRIu64"\n", i, number_events[i]);
															
 
																 		}
															
 
																 		free(number_events);
															
--- a/src/debug/traces/starpu_fxt.h
+++ b/src/debug/traces/starpu_fxt.h
@@ -68,6 +68,7 @@ void _starpu_fxt_write_paje_header(FILE *file, struct starpu_fxt_options *option
 
																 extern int _starpu_poti_extendedSetState;
															
 
																 extern int _starpu_poti_semiExtendedSetState;
															
 
																 extern int _starpu_poti_MemoryEvent;
															
 
																+extern int _starpu_poti_CommLinkStart;
															
 
																 extern int _starpu_poti_MpiLinkStart;
															
 
																 /*
															
--- a/src/debug/traces/starpu_fxt_mpi.c
+++ b/src/debug/traces/starpu_fxt_mpi.c
@@ -331,7 +331,9 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
																 			char str_mpi_tag[STARPU_POTI_STR_LEN];
															
 
																 			snprintf(str_mpi_tag, sizeof(str_mpi_tag), "%ld", mpi_tag);
															
 
																-			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 1, str_mpi_tag);
															
 
																+			char str_handle[STARPU_POTI_STR_LEN];
															
 
																+			snprintf(str_handle, sizeof(str_handle), "%lx", send_handle);
															
 
																+			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 2, str_mpi_tag, str_handle);
															
 
																 			poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_out_bandwidth[src]);
															
 
																 			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
															
@@ -340,7 +342,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
																 #else
															
 
																 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo_mpi	%f\n", start_date, src, current_out_bandwidth[src]);
															
 
																 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi_mpi	%f\n", start_date, dst, current_in_bandwidth[dst]);
															
 
																-			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld\n", start_date, (unsigned long)size, src, id, mpi_tag);
															
 
																+			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld	%lx\n", start_date, (unsigned long)size, src, id, mpi_tag, send_handle);
															
 
																 			fprintf(out_paje_file, "19	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
															
 
																 #endif
															
@@ -354,10 +356,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
																 				fprintf(out_comms_file, "SendHandle: %lx\n", send_handle);
															
 
																 				fprintf(out_comms_file, "RecvHandle: %lx\n", recv_handle);
															
 
																 				if (cur->jobid != -1)
															
 
																-					fprintf(out_comms_file, "SendJobId: %d_%lu\n", src, cur->jobid);
															
 
																+					fprintf(out_comms_file, "SendJobId: %d_%ld\n", src, cur->jobid);
															
 
																 				if (match->jobid != -1)
															
 
																-					fprintf(out_comms_file, "RecvJobId: %d_%lu\n", dst, match->jobid);
															
 
																-				fprintf(out_comms_file, "Size: %ld\n", size);
															
 
																+					fprintf(out_comms_file, "RecvJobId: %d_%ld\n", dst, match->jobid);
															
 
																+				fprintf(out_comms_file, "Size: %lu\n", (unsigned long)size);
															
 
																 				fprintf(out_comms_file, "\n");
															
 
																 			}
															
 
																 		}
															
@@ -372,7 +374,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
																 	if (nb_wrong_comm_timing == 1)
															
 
																 		_STARPU_MSG("Warning: a communication finished before it started !\n");
															
 
																 	else if (nb_wrong_comm_timing > 1)
															
 
																-		_STARPU_MSG("Warning: %d communications finished before they started !\n", nb_wrong_comm_timing);
															
 
																+		_STARPU_MSG("Warning: %u communications finished before they started !\n", nb_wrong_comm_timing);
															
 
																 }
															
 
																 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
															
--- a/src/debug/traces/starpu_paje.c
+++ b/src/debug/traces/starpu_paje.c
@@ -28,6 +28,7 @@
 
																 int _starpu_poti_extendedSetState = -1;
															
 
																 int _starpu_poti_semiExtendedSetState = -1;
															
 
																 int _starpu_poti_MemoryEvent = -1;
															
 
																+int _starpu_poti_CommLinkStart = -1;
															
 
																 int _starpu_poti_MpiLinkStart = -1;
															
 
																 #endif
															
 
																 #endif
															
@@ -62,6 +63,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 
																 						     "SubmitOrder string"
															
 
																 						     );
															
 
																 #ifdef HAVE_POTI_USER_NEWEVENT
															
 
																+	_starpu_poti_CommLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "Handle string");
															
 
																 	if (options->memory_states)
															
 
																 	{
															
 
																 		_starpu_poti_MemoryEvent = poti_header_DeclareEvent (PAJE_NewEvent,
															
@@ -71,7 +73,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 
																 							     "Size string",
															
 
																 							     "Dest string");
															
 
																 	}
															
 
																-	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "MPITAG string");
															
 
																+	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 2, "MPITAG string", "Handle string");
															
 
																 #endif
															
 
																 #else
															
 
																 	poti_header(1,1);
															
@@ -230,15 +232,16 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 
																 	fprintf(file, "%%	StartContainer	string\n");
															
 
																 	fprintf(file, "%%	Key	string\n");
															
 
																 	fprintf(file, "%%	MPITAG	string\n");
															
 
																+	fprintf(file, "%%	Handle	string\n");
															
 
																 	fprintf(file, "%%EndEventDef\n");
															
 
																-	fprintf(file, "%%EventDef	PajeEndLink	24\n");
															
 
																+	fprintf(file, "%%EventDef	PajeStartLink	24\n");
															
 
																 	fprintf(file, "%%	Time	date\n");
															
 
																 	fprintf(file, "%%	Type	string\n");
															
 
																 	fprintf(file, "%%	Container	string\n");
															
 
																 	fprintf(file, "%%	Value	string\n");
															
 
																-	fprintf(file, "%%	EndContainer	string\n");
															
 
																+	fprintf(file, "%%	StartContainer	string\n");
															
 
																 	fprintf(file, "%%	Key	string\n");
															
 
																-	fprintf(file, "%%	MPITAG	string\n");
															
 
																+	fprintf(file, "%%	Handle	string\n");
															
 
																 	fprintf(file, "%%EndEventDef\n");
															
 
																 #endif
															
--- a/src/sched_policies/component_work_stealing.c
+++ b/src/sched_policies/component_work_stealing.c
@@ -30,14 +30,20 @@
 
																 #warning TODO: locality work-stealing
															
 
																 #endif
															
 
																+struct _starpu_component_work_stealing_data_per_worker
															
 
																+{
															
 
																+	struct _starpu_prio_deque fifo;
															
 
																+	unsigned last_pop_child;
															
 
																+};
															
 
																+
															
 
																 struct _starpu_component_work_stealing_data
															
 
																 {
															
 
																 /* keep track of the work performed from the beginning of the algorithm to make
															
 
																  * better decisions about which queue to child when stealing or deferring work
															
 
																  */
															
 
																-	unsigned performed_total, last_pop_child, last_push_child;
															
 
																+	struct _starpu_component_work_stealing_data_per_worker *per_worker;
															
 
																+	unsigned performed_total, last_push_child;
															
 
																-	struct _starpu_prio_deque * fifos;
															
 
																 	starpu_pthread_mutex_t ** mutexes;
															
 
																 	unsigned size;
															
 
																 };
															
@@ -50,16 +56,14 @@ struct _starpu_component_work_stealing_data
 
																 static struct starpu_task *  steal_task_round_robin(struct starpu_sched_component *component, int workerid)
															
 
																 {
															
 
																 	struct _starpu_component_work_stealing_data *wsd = component->data;
															
 
																-	STARPU_HG_DISABLE_CHECKING(wsd->last_pop_child);
															
 
																-	unsigned i = wsd->last_pop_child;
															
 
																-	wsd->last_pop_child = (i + 1) % component->nchildren;
															
 
																-	STARPU_HG_ENABLE_CHECKING(wsd->last_pop_child);
															
 
																+	unsigned i = wsd->per_worker[workerid].last_pop_child;
															
 
																+	wsd->per_worker[workerid].last_pop_child = (i + 1) % component->nchildren;
															
 
																 	/* If the worker's queue have no suitable tasks, let's try
															
 
																 	 * the next ones */
															
 
																 	struct starpu_task * task = NULL;
															
 
																 	while (1)
															
 
																 	{
															
 
																-		struct _starpu_prio_deque * fifo = &wsd->fifos[i];
															
 
																+		struct _starpu_prio_deque * fifo = &wsd->per_worker[i].fifo;
															
 
																 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
															
 
																 		task = _starpu_prio_deque_deque_task_for_worker(fifo, workerid, NULL);
															
@@ -75,7 +79,7 @@ static struct starpu_task *  steal_task_round_robin(struct starpu_sched_componen
 
																 			break;
															
 
																 		}
															
 
																-		if (i == wsd->last_pop_child)
															
 
																+		if (i == wsd->per_worker[workerid].last_pop_child)
															
 
																 		{
															
 
																 			/* We got back to the first worker,
															
 
																 			 * don't go in infinite loop */
															
@@ -141,17 +145,17 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 
																 	struct _starpu_component_work_stealing_data * wsd = component->data;
															
 
																 	const double now = starpu_timing_now();
															
 
																 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
															
 
																-	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->fifos[i]);
															
 
																+	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->per_worker[i].fifo);
															
 
																 	if(task)
															
 
																 	{
															
 
																 		if(!isnan(task->predicted))
															
 
																 		{
															
 
																-			wsd->fifos[i].exp_len -= task->predicted;
															
 
																-			wsd->fifos[i].exp_start = now + task->predicted;
															
 
																+			wsd->per_worker[i].fifo.exp_len -= task->predicted;
															
 
																+			wsd->per_worker[i].fifo.exp_start = now + task->predicted;
															
 
																 		}
															
 
																 	}
															
 
																 	else
															
 
																-		wsd->fifos[i].exp_len = 0.0;
															
 
																+		wsd->per_worker[i].fifo.exp_len = 0.0;
															
 
																 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
															
 
																 	if(task)
															
@@ -163,7 +167,7 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 
																 	if(task)
															
 
																 	{
															
 
																 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
															
 
																-		wsd->fifos[i].nprocessed++;
															
 
																+		wsd->per_worker[i].fifo.nprocessed++;
															
 
																 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
															
 
																 		return task;
															
@@ -196,9 +200,9 @@ double _ws_estimated_end(struct starpu_sched_component * component)
 
																 	for(i = 0; i < component->nchildren; i++)
															
 
																 	{
															
 
																 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
															
 
																-		sum_len += wsd->fifos[i].exp_len;
															
 
																-		wsd->fifos[i].exp_start = STARPU_MAX(now, wsd->fifos[i].exp_start);
															
 
																-		sum_start += wsd->fifos[i].exp_start;
															
 
																+		sum_len += wsd->per_worker[i].fifo.exp_len;
															
 
																+		wsd->per_worker[i].fifo.exp_start = STARPU_MAX(now, wsd->per_worker[i].fifo.exp_start);
															
 
																+		sum_start += wsd->per_worker[i].fifo.exp_start;
															
 
																 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
															
 
																 	}
															
@@ -216,7 +220,7 @@ double _ws_estimated_load(struct starpu_sched_component * component)
 
																 	for(i = 0; i < component->nchildren; i++)
															
 
																 	{
															
 
																 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
															
 
																-		ntasks += wsd->fifos[i].ntasks;
															
 
																+		ntasks += wsd->per_worker[i].fifo.ntasks;
															
 
																 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
															
 
																 	}
															
 
																 	double speedup = 0.0;
															
@@ -265,7 +269,7 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 
																 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
															
 
																 	starpu_sched_task_break(task);
															
 
																-	ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i], task);
															
 
																+	ret = _starpu_prio_deque_push_front_task(&wsd->per_worker[i].fifo, task);
															
 
																 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
															
 
																 	wsd->last_push_child = i;
															
@@ -308,9 +312,9 @@ int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task)
 
																 			struct _starpu_component_work_stealing_data * wsd = component->data;
															
 
																 			STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
															
 
																-			int ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i] , task);
															
 
																+			int ret = _starpu_prio_deque_push_front_task(&wsd->per_worker[i].fifo , task);
															
 
																 			if(ret == 0 && !isnan(task->predicted))
															
 
																-				wsd->fifos[i].exp_len += task->predicted;
															
 
																+				wsd->per_worker[i].fifo.exp_len += task->predicted;
															
 
																 			STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
															
 
																 			component->can_pull(component);
															
@@ -329,12 +333,13 @@ void _ws_add_child(struct starpu_sched_component * component, struct starpu_sche
 
																 	if(wsd->size < component->nchildren)
															
 
																 	{
															
 
																 		STARPU_ASSERT(wsd->size == component->nchildren - 1);
															
 
																-		_STARPU_REALLOC(wsd->fifos, component->nchildren * sizeof(*wsd->fifos));
															
 
																+		_STARPU_REALLOC(wsd->per_worker, component->nchildren * sizeof(*wsd->per_worker));
															
 
																 		_STARPU_REALLOC(wsd->mutexes, component->nchildren * sizeof(*wsd->mutexes));
															
 
																 		wsd->size = component->nchildren;
															
 
																 	}
															
 
																-	_starpu_prio_deque_init(&wsd->fifos[component->nchildren - 1]);
															
 
																+	wsd->per_worker[component->nchildren - 1].last_pop_child = 0;
															
 
																+	_starpu_prio_deque_init(&wsd->per_worker[component->nchildren - 1].fifo);
															
 
																 	starpu_pthread_mutex_t *mutex;
															
 
																 	_STARPU_MALLOC(mutex, sizeof(*mutex));
															
@@ -356,8 +361,8 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 
																 			break;
															
 
																 	}
															
 
																 	STARPU_ASSERT(i_component != component->nchildren);
															
 
																-	struct _starpu_prio_deque tmp_fifo = wsd->fifos[i_component];
															
 
																-	wsd->fifos[i_component] = wsd->fifos[component->nchildren - 1];
															
 
																+	struct _starpu_prio_deque tmp_fifo = wsd->per_worker[i_component].fifo;
															
 
																+	wsd->per_worker[i_component].fifo = wsd->per_worker[component->nchildren - 1].fifo;
															
 
																 	component->children[i_component] = component->children[component->nchildren - 1];
															
@@ -372,7 +377,7 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 
																 void _work_stealing_component_deinit_data(struct starpu_sched_component * component)
															
 
																 {
															
 
																 	struct _starpu_component_work_stealing_data * wsd = component->data;
															
 
																-	free(wsd->fifos);
															
 
																+	free(wsd->per_worker);
															
 
																 	free(wsd->mutexes);
															
 
																 	free(wsd);
															
 
																 }
															
--- a/src/sched_policies/work_stealing_policy.c
+++ b/src/sched_policies/work_stealing_policy.c
@@ -71,10 +71,21 @@ struct locality_entry
 
																 struct _starpu_work_stealing_data_per_worker
															
 
																 {
															
 
																+	char fill1[STARPU_CACHELINE_SIZE];
															
 
																+	/* This is read-mostly, only updated when the queue becomes empty or
															
 
																+	 * becomes non-empty, to make it generally cheap to check */
															
 
																+	unsigned notask;	/* whether the queue is empty */
															
 
																+	char fill2[STARPU_CACHELINE_SIZE];
															
 
																+
															
 
																 	struct _starpu_prio_deque queue;
															
 
																 	int running;
															
 
																 	int *proxlist;
															
 
																-	int busy;
															
 
																+	int busy;	/* Whether this worker is working on a task */
															
 
																+
															
 
																+	/* keep track of the work performed from the beginning of the algorithm to make
															
 
																+	 * better decisions about which queue to select when stealing work
															
 
																+	 */
															
 
																+	unsigned last_pop_worker;
															
 
																 #ifdef USE_LOCALITY_TASKS
															
 
																 	/* This records the same as queue, but hashed by data accessed with locality flag.  */
															
@@ -93,9 +104,8 @@ struct _starpu_work_stealing_data
 
																 	int (*select_victim)(struct _starpu_work_stealing_data *, unsigned, int);
															
 
																 	struct _starpu_work_stealing_data_per_worker *per_worker;
															
 
																 	/* keep track of the work performed from the beginning of the algorithm to make
															
 
																-	 * better decisions about which queue to select when stealing or deferring work
															
 
																+	 * better decisions about which queue to select when deferring work
															
 
																 	 */
															
 
																-	unsigned last_pop_worker;
															
 
																 	unsigned last_push_worker;
															
 
																 };
															
@@ -118,7 +128,8 @@ static int calibration_value = 0;
 
																  */
															
 
																 static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsigned sched_ctx_id)
															
 
																 {
															
 
																-	unsigned worker = ws->last_pop_worker;
															
 
																+	unsigned workerid = starpu_worker_get_id_check();
															
 
																+	unsigned worker = ws->per_worker[workerid].last_pop_worker;
															
 
																 	unsigned nworkers;
															
 
																 	int *workerids = NULL;
															
 
																 	nworkers = starpu_sched_ctx_get_workers_list_raw(sched_ctx_id, &workerids);
															
@@ -131,14 +142,17 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 
																 		/* Here helgrind would shout that this is unprotected, but we
															
 
																 		 * are fine with getting outdated values, this is just an
															
 
																 		 * estimation */
															
 
																-		ntasks = ws->per_worker[workerids[worker]].queue.ntasks;
															
 
																-
															
 
																-		if (ntasks && (ws->per_worker[workerids[worker]].busy
															
 
																-					   || starpu_worker_is_blocked_in_parallel(workerids[worker])))
															
 
																-			break;
															
 
																+		if (!ws->per_worker[workerids[worker]].notask)
															
 
																+		{
															
 
																+			if (ws->per_worker[workerids[worker]].busy
															
 
																+						   || starpu_worker_is_blocked_in_parallel(workerids[worker])) {
															
 
																+				ntasks = 1;
															
 
																+				break;
															
 
																+			}
															
 
																+		}
															
 
																 		worker = (worker + 1) % nworkers;
															
 
																-		if (worker == ws->last_pop_worker)
															
 
																+		if (worker == ws->per_worker[workerid].last_pop_worker)
															
 
																 		{
															
 
																 			/* We got back to the first worker,
															
 
																 			 * don't go in infinite loop */
															
@@ -147,7 +161,7 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 
																 		}
															
 
																 	}
															
 
																-	ws->last_pop_worker = (worker + 1) % nworkers;
															
 
																+	ws->per_worker[workerid].last_pop_worker = (worker + 1) % nworkers;
															
 
																 	worker = workerids[worker];
															
@@ -327,15 +341,31 @@ static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, i
 
																 	{
															
 
																 		/* found an interesting task, try to pick it! */
															
 
																 		if (_starpu_prio_deque_pop_this_task(&data_source->queue, target, best_task))
															
 
																+		{
															
 
																+			if (!data_source->queue.ntasks)
															
 
																+			{
															
 
																+				STARPU_ASSERT(ws->per_worker[source].notask == 0);
															
 
																+				ws->per_worker[source].notask = 1;
															
 
																+			}
															
 
																 			return best_task;
															
 
																+		}
															
 
																 	}
															
 
																 	/* Didn't find an interesting task, or couldn't run it :( */
															
 
																 	int skipped;
															
 
																+	struct starpu_task *task;
															
 
																+
															
 
																 	if (source != target)
															
 
																-		return _starpu_prio_deque_deque_task_for_worker(&data_source->queue, target, &skipped);
															
 
																+		task = _starpu_prio_deque_deque_task_for_worker(&data_source->queue, target, &skipped);
															
 
																 	else
															
 
																-		return _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
															
 
																+		task = _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
															
 
																+
															
 
																+	if (task && !data_source->queue.ntasks)
															
 
																+	{
															
 
																+		STARPU_ASSERT(ws->per_worker[source].notask == 0);
															
 
																+		ws->per_worker[source].notask = 1;
															
 
																+	}
															
 
																+	return task;
															
 
																 }
															
 
																 /* Called when popping a task from a queue */
															
@@ -371,10 +401,18 @@ static void locality_pushed_task(struct _starpu_work_stealing_data *ws STARPU_AT
 
																 static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, int source, int target)
															
 
																 {
															
 
																 	int skipped;
															
 
																+	struct starpu_task *task;
															
 
																 	if (source != target)
															
 
																-		return _starpu_prio_deque_deque_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
															
 
																+		task = _starpu_prio_deque_deque_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
															
 
																 	else
															
 
																-		return _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
															
 
																+		task = _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
															
 
																+
															
 
																+	if (task && !ws->per_worker[source].queue.ntasks)
															
 
																+	{
															
 
																+		STARPU_ASSERT(ws->per_worker[source].notask == 0);
															
 
																+		ws->per_worker[source].notask = 1;
															
 
																+	}
															
 
																+	return task;
															
 
																 }
															
 
																 /* Called when popping a task from a queue */
															
 
																 static void locality_popped_task(struct _starpu_work_stealing_data *ws STARPU_ATTRIBUTE_UNUSED, struct starpu_task *task STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED, unsigned sched_ctx_id STARPU_ATTRIBUTE_UNUSED)
															
@@ -530,7 +568,8 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 
																 	struct starpu_task *task = NULL;
															
 
																 	unsigned workerid = starpu_worker_get_id_check();
															
 
																-	ws->per_worker[workerid].busy = 0;
															
 
																+	if (ws->per_worker[workerid].busy)
															
 
																+		ws->per_worker[workerid].busy = 0;
															
 
																 #ifdef STARPU_NON_BLOCKING_DRIVERS
															
 
																 	if (STARPU_RUNNING_ON_VALGRIND || !_starpu_prio_deque_is_empty(&ws->per_worker[workerid].queue))
															
@@ -617,7 +656,8 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 
																 		if (!task)
															
 
																 			return NULL;
															
 
																 	}
															
 
																-	ws->per_worker[workerid].busy = !!task;
															
 
																+	if (ws->per_worker[workerid].busy != !!task)
															
 
																+		ws->per_worker[workerid].busy = !!task;
															
 
																 	return task;
															
 
																 }
															
@@ -648,6 +688,11 @@ int ws_push_task(struct starpu_task *task)
 
																 	record_data_locality(task, workerid);
															
 
																 	STARPU_ASSERT_MSG(ws->per_worker[workerid].running, "workerid=%d, ws=%p\n", workerid, ws);
															
 
																 	_starpu_prio_deque_push_back_task(&ws->per_worker[workerid].queue, task);
															
 
																+	if (ws->per_worker[workerid].queue.ntasks == 1)
															
 
																+	{
															
 
																+		STARPU_ASSERT(ws->per_worker[workerid].notask == 1);
															
 
																+		ws->per_worker[workerid].notask = 0;
															
 
																+	}
															
 
																 	locality_pushed_task(ws, task, workerid, sched_ctx_id);
															
 
																 	starpu_push_task_end(task);
															
@@ -676,10 +721,12 @@ static void ws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworke
 
																 		int workerid = workerids[i];
															
 
																 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
															
 
																 		_starpu_prio_deque_init(&ws->per_worker[workerid].queue);
															
 
																+		ws->per_worker[workerid].notask = 1;
															
 
																 		ws->per_worker[workerid].running = 1;
															
 
																 		/* Tell helgrind that we are fine with getting outdated values,
															
 
																 		 * this is just an estimation */
															
 
																+		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].notask);
															
 
																 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].queue.ntasks);
															
 
																 		ws->per_worker[workerid].busy = 0;
															
 
																 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].busy);
															
@@ -708,9 +755,7 @@ static void initialize_ws_policy(unsigned sched_ctx_id)
 
																 	_STARPU_MALLOC(ws, sizeof(struct _starpu_work_stealing_data));
															
 
																 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
															
 
																-	ws->last_pop_worker = 0;
															
 
																 	ws->last_push_worker = 0;
															
 
																-	STARPU_HG_DISABLE_CHECKING(ws->last_pop_worker);
															
 
																 	STARPU_HG_DISABLE_CHECKING(ws->last_push_worker);
															
 
																 	ws->select_victim = select_victim;
															
@@ -760,11 +805,12 @@ static int lws_select_victim(struct _starpu_work_stealing_data *ws, unsigned sch
 
																 	for (i = 0; i < nworkers; i++)
															
 
																 	{
															
 
																 		int neighbor = ws->per_worker[workerid].proxlist[i];
															
 
																+		if (ws->per_worker[neighbor].notask)
															
 
																+			continue;
															
 
																                 /* FIXME: do not keep looking again and again at some worker
															
 
																                  * which has tasks, but that can't execute on me */
															
 
																-		int ntasks = ws->per_worker[neighbor].queue.ntasks;
															
 
																-		if (ntasks && (ws->per_worker[neighbor].busy
															
 
																-					   || starpu_worker_is_blocked_in_parallel(neighbor)))
															
 
																+		if (ws->per_worker[neighbor].busy
															
 
																+					   || starpu_worker_is_blocked_in_parallel(neighbor))
															
 
																 			return neighbor;
															
 
																 	}
															
 
																 	return -1;
															
--- a/src/util/execute_on_all.c
+++ b/src/util/execute_on_all.c
@@ -107,8 +107,7 @@ void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t
 
																 	unsigned nworkers = starpu_worker_get_count();
															
 
																 	struct starpu_task *tasks[STARPU_NMAXWORKERS];
															
 
																-	/* This method only work on CPU, CUDA, OPENCL */
															
 
																-	STARPU_ASSERT((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0);
															
 
																+	STARPU_ASSERT_MSG((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0, "This function is implemented only on CPU, CUDA, OpenCL");
															
 
																 	/* create a wrapper codelet */
															
 
																 	struct starpu_codelet wrapper_cl =
															
--- a/src/util/starpu_data_cpy.c
+++ b/src/util/starpu_data_cpy.c
@@ -159,6 +159,7 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 
																 	task->callback_func = callback_func;
															
 
																 	task->callback_arg = callback_arg;
															
 
																+	/* FIXME: priority!! */
															
 
																 	STARPU_TASK_SET_HANDLE(task, dst_handle, 0);
															
 
																 	STARPU_TASK_SET_HANDLE(task, src_handle, 1);
															
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -43,6 +43,7 @@ EXTRA_DIST =					\
 
																 	microbenchs/parallel_independent_heterogeneous_tasks.sh	\
															
 
																 	microbenchs/parallel_independent_homogeneous_tasks_data.sh	\
															
 
																 	microbenchs/parallel_independent_homogeneous_tasks.sh	\
															
 
																+	microbenchs/bandwidth_scheds.sh		\
															
 
																 	energy/static.sh			\
															
 
																 	energy/dynamic.sh			\
															
 
																 	energy/perfs.gp				\
															
@@ -73,7 +74,7 @@ EXTRA_DIST =					\
 
																 	model-checking/starpu-mc.sh.in
															
 
																 CLEANFILES = 					\
															
 
																-	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90
															
 
																+	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90 bandwidth-*.dat bandwidth.gp bandwidth.eps bandwidth.svg
															
 
																 BUILT_SOURCES =
															
 
																 SUBDIRS =
															
@@ -268,6 +269,7 @@ myPROGRAMS +=				\
 
																 	datawizard/acquire_cb_insert		\
															
 
																 	datawizard/acquire_release		\
															
 
																 	datawizard/acquire_release2		\
															
 
																+	datawizard/acquire_release_to		\
															
 
																 	datawizard/acquire_try			\
															
 
																 	datawizard/bcsr				\
															
 
																 	datawizard/cache			\
															
@@ -355,6 +357,7 @@ myPROGRAMS +=				\
 
																 	microbenchs/prefetch_data_on_node 	\
															
 
																 	microbenchs/redundant_buffer		\
															
 
																 	microbenchs/matrix_as_vector		\
															
 
																+	microbenchs/bandwidth			\
															
 
																 	overlap/gpu_concurrency			\
															
 
																 	parallel_tasks/explicit_combined_worker	\
															
 
																 	parallel_tasks/parallel_kernels		\
															
@@ -431,6 +434,8 @@ examplebin_SCRIPTS = \
 
																 	microbenchs/tasks_size_overhead.sh
															
 
																 if !STARPU_SIMGRID
															
 
																 if !STARPU_USE_MPI_MASTER_SLAVE
															
 
																+examplebin_PROGRAMS += \
															
 
																+	microbenchs/bandwidth
															
 
																 SHELL_TESTS += \
															
 
																 	microbenchs/tasks_data_overhead.sh \
															
 
																 	microbenchs/sync_tasks_data_overhead.sh \
															
@@ -454,6 +459,7 @@ endif
 
																 if !STARPU_USE_MPI_MASTER_SLAVE
															
 
																 SHELL_TESTS += \
															
 
																 	datawizard/locality.sh \
															
 
																+	microbenchs/bandwidth_scheds.sh \
															
 
																 	overlap/overlap.sh
															
 
																 endif
															
@@ -505,6 +511,17 @@ datawizard_acquire_release2_SOURCES +=		\
 
																 	datawizard/acquire_release_opencl.c
															
 
																 endif
															
 
																+datawizard_acquire_release_to_SOURCES =		\
															
 
																+	datawizard/acquire_release_to.c
															
 
																+if STARPU_USE_CUDA
															
 
																+datawizard_acquire_release_to_SOURCES +=		\
															
 
																+	datawizard/acquire_release_cuda.cu
															
 
																+endif
															
 
																+if STARPU_USE_OPENCL
															
 
																+datawizard_acquire_release_to_SOURCES +=		\
															
 
																+	datawizard/acquire_release_opencl.c
															
 
																+endif
															
 
																+
															
 
																 datawizard_scratch_SOURCES =			\
															
 
																 	datawizard/scratch.c
															
 
																 if STARPU_USE_CUDA
															
--- a/tests/datawizard/acquire_release_to.c
+++ b/tests/datawizard/acquire_release_to.c
@@ -0,0 +1,214 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include "../helper.h"
															
 
																+
															
 
																+/*
															
 
																+ * Check that _release_to correctly interacts with tasks working on the same data
															
 
																+ */
															
 
																+
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																+static unsigned ntasks = 10;
															
 
																+#elif !defined(STARPU_LONG_CHECK)
															
 
																+static unsigned ntasks = 1000;
															
 
																+#else
															
 
																+static unsigned ntasks = 10000;
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+extern void increment_cuda(void *descr[], void *_args);
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+extern void increment_opencl(void *buffers[], void *args);
															
 
																+#endif
															
 
																+
															
 
																+void increment_cpu(void *descr[], void *arg)
															
 
																+{
															
 
																+	(void)arg;
															
 
																+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																+	(*tokenptr)++;
															
 
																+}
															
 
																+
															
 
																+static struct starpu_codelet increment_cl =
															
 
																+{
															
 
																+	.modes = { STARPU_RW },
															
 
																+	.cpu_funcs = {increment_cpu},
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_funcs = {increment_cuda},
															
 
																+	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	.opencl_funcs = {increment_opencl},
															
 
																+	.opencl_flags = {STARPU_OPENCL_ASYNC},
															
 
																+#endif
															
 
																+	.cpu_funcs_name = {"increment_cpu"},
															
 
																+	.nbuffers = 1
															
 
																+};
															
 
																+
															
 
																+void check_cpu(void *descr[], void *arg)
															
 
																+{
															
 
																+	unsigned *val = arg;
															
 
																+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																+	STARPU_ASSERT(*tokenptr == *val);
															
 
																+}
															
 
																+
															
 
																+static struct starpu_codelet check_cl =
															
 
																+{
															
 
																+	.modes = { STARPU_R },
															
 
																+	.cpu_funcs = {check_cpu},
															
 
																+	.cpu_funcs_name = {"check_cpu"},
															
 
																+	.nbuffers = 1
															
 
																+};
															
 
																+
															
 
																+unsigned token = 0;
															
 
																+starpu_data_handle_t token_handle;
															
 
																+
															
 
																+static
															
 
																+int increment_token(void)
															
 
																+{
															
 
																+	int ret;
															
 
																+	struct starpu_task *task = starpu_task_create();
															
 
																+	task->cl = &increment_cl;
															
 
																+	task->handles[0] = token_handle;
															
 
																+	ret = starpu_task_submit(task);
															
 
																+	return ret;
															
 
																+}
															
 
																+
															
 
																+static
															
 
																+int check_token(unsigned value)
															
 
																+{
															
 
																+	unsigned *value_p;
															
 
																+	int ret;
															
 
																+	struct starpu_task *task = starpu_task_create();
															
 
																+	task->cl = &check_cl;
															
 
																+	task->handles[0] = token_handle;
															
 
																+	task->cl_arg = value_p = malloc(sizeof(*value_p));
															
 
																+	task->cl_arg_size = sizeof(*value_p);
															
 
																+	task->cl_arg_free = 1;
															
 
																+	*value_p = value;
															
 
																+	ret = starpu_task_submit(task);
															
 
																+	return ret;
															
 
																+}
															
 
																+
															
 
																+static
															
 
																+void callback(void *arg)
															
 
																+{
															
 
																+	(void)arg;
															
 
																+	token++;
															
 
																+	starpu_data_release_to(token_handle, STARPU_W);
															
 
																+	starpu_sleep(0.001);
															
 
																+	starpu_data_release_to(token_handle, STARPU_R);
															
 
																+	starpu_sleep(0.001);
															
 
																+	starpu_data_release(token_handle);
															
 
																+}
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+struct starpu_opencl_program opencl_program;
															
 
																+#endif
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	unsigned i;
															
 
																+	int ret;
															
 
																+
															
 
																+        ret = starpu_initialize(NULL, &argc, &argv);
															
 
																+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/acquire_release_opencl_kernel.cl",
															
 
																+						  &opencl_program, NULL);
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
															
 
																+#endif
															
 
																+	starpu_variable_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, sizeof(unsigned));
															
 
																+
															
 
																+        FPRINTF(stderr, "Token: %u\n", token);
															
 
																+
															
 
																+	for(i=0; i<ntasks; i++)
															
 
																+	{
															
 
																+		/* synchronize data in RAM */
															
 
																+                ret = starpu_data_acquire(token_handle, STARPU_RW);
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
															
 
																+
															
 
																+                token ++;
															
 
																+
															
 
																+		ret = check_token(4*i+1);
															
 
																+		if (ret == -ENODEV) goto enodev_release;
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																+
															
 
																+		ret = increment_token();
															
 
																+		if (ret == -ENODEV) goto enodev_release;
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																+
															
 
																+		ret = check_token(4*i+2);
															
 
																+		if (ret == -ENODEV) goto enodev_release;
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																+
															
 
																+		starpu_sleep(0.001);
															
 
																+		starpu_data_release_to(token_handle, STARPU_W);
															
 
																+
															
 
																+		starpu_sleep(0.001);
															
 
																+		starpu_data_release_to(token_handle, STARPU_R);
															
 
																+
															
 
																+		starpu_sleep(0.001);
															
 
																+		starpu_data_release(token_handle);
															
 
																+
															
 
																+		ret = starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
															
 
																+
															
 
																+		ret = check_token(4*i+3);
															
 
																+		if (ret == -ENODEV) goto enodev;
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																+
															
 
																+		ret = increment_token();
															
 
																+		if (ret == -ENODEV) goto enodev;
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																+
															
 
																+		ret = check_token(4*i+4);
															
 
																+		if (ret == -ENODEV) goto enodev;
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																+
															
 
																+	}
															
 
																+
															
 
																+	starpu_data_unregister(token_handle);
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        ret = starpu_opencl_unload_opencl(&opencl_program);
															
 
																+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
															
 
																+#endif
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+        FPRINTF(stderr, "Token: %u\n", token);
															
 
																+	if (token == ntasks * 4)
															
 
																+		ret = EXIT_SUCCESS;
															
 
																+	else
															
 
																+		ret = EXIT_FAILURE;
															
 
																+	return ret;
															
 
																+
															
 
																+enodev_release:
															
 
																+	starpu_data_release(token_handle);
															
 
																+enodev:
															
 
																+	starpu_data_unregister(token_handle);
															
 
																+	fprintf(stderr, "WARNING: No one can execute this task\n");
															
 
																+	/* yes, we do not perform the computation but we did detect that no one
															
 
																+ 	 * could perform the kernel, so this is not an error from StarPU */
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        ret = starpu_opencl_unload_opencl(&opencl_program);
															
 
																+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
															
 
																+#endif
															
 
																+	starpu_shutdown();
															
 
																+	return STARPU_TEST_SKIPPED;
															
 
																+}
															
--- a/tests/datawizard/bcsr.c
+++ b/tests/datawizard/bcsr.c
@@ -39,20 +39,20 @@ void cpu_show_bcsr(void *descr[], void *arg)
 
																 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
															
 
																-	printf("\nnnz %d elemsize %d\n", nnz, elemsize);
															
 
																+	printf("\nnnz %u elemsize %u\n", nnz, elemsize);
															
 
																 	for (i = 0; i < nrow; i++)
															
 
																 	{
															
 
																 		uint32_t row_start = rowptr[i] - firstentry;
															
 
																 		uint32_t row_end = rowptr[i+1] - firstentry;
															
 
																-		printf("row %d\n", i);
															
 
																+		printf("row %u\n", i);
															
 
																 		for (j = row_start; j < row_end; j++)
															
 
																 		{
															
 
																 			int *block = nzval + j * r*c;
															
 
																-			printf( " column %d\n", colind[j]);
															
 
																+			printf( " column %u\n", colind[j]);
															
 
																 			for (y = 0; y < r; y++)
															
 
																 			{
															
--- a/tests/datawizard/interfaces/block/block_opencl.c
+++ b/tests/datawizard/interfaces/block/block_opencl.c
@@ -83,12 +83,12 @@ test_block_opencl_func(void *buffers[], void *args)
 
																 	}
															
 
																 	{
															
 
																-		size_t global = nx * ny * nz;
															
 
																+		size_t global[3] = {nx, ny, nz};
															
 
																 		err = clEnqueueNDRangeKernel(queue,
															
 
																 					     kernel,
															
 
																-					     1,
															
 
																+					     3,
															
 
																 					     NULL,
															
 
																-					     &global,
															
 
																+					     global,
															
 
																 					     NULL,
															
 
																 					     0,
															
 
																 					     NULL,
															
--- a/tests/datawizard/interfaces/block/block_opencl_kernel.cl
+++ b/tests/datawizard/interfaces/block/block_opencl_kernel.cl
@@ -18,29 +18,21 @@ __kernel void block_opencl(__global int *block,
 
																 			   int ldy, int ldz,
															
 
																 			   int factor, __global int *err)
															
 
																 {
															
 
																-        const int id = get_global_id(0);
															
 
																-	if (id > 0)
															
 
																+	const int idx = get_global_id(0);
															
 
																+	const int idy = get_global_id(1);
															
 
																+	const int idz = get_global_id(2);
															
 
																+	if (idx >= nx)
															
 
																 		return;
															
 
																+	if (idy >= ny)
															
 
																+		return;
															
 
																+	if (idz >= nz)
															
 
																+		return;
															
 
																+
															
 
																+	int val = idz*ny*nx+idy*nx+idx;
															
 
																+	int i = (idz*ldz)+(idy*ldy)+idx;
															
 
																-	unsigned int i, j, k;
															
 
																-	int val = 0;
															
 
																-	for (k = 0; k < nz; k++)
															
 
																-	{
															
 
																-		for (j = 0; j < ny; j++)
															
 
																-		{
															
 
																-			for (i = 0; i < nx; i++)
															
 
																-			{
															
 
																-                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
															
 
																-				{
															
 
																-					*err = 1;
															
 
																-					return;
															
 
																-				}
															
 
																-				else
															
 
																-				{
															
 
																-					block[(k*ldz)+(j*ldy)+i] *= -1;
															
 
																-					val++;
															
 
																-				}
															
 
																-			}
															
 
																-		}
															
 
																-	}
															
 
																+	if (block[i] != factor * val)
															
 
																+		*err = 1;
															
 
																+	else
															
 
																+		block[i] *= -1;
															
 
																 }
															
--- a/tests/datawizard/interfaces/tensor/tensor_interface.c
+++ b/tests/datawizard/interfaces/tensor/tensor_interface.c
@@ -18,7 +18,7 @@
 
																 #include "../test_interfaces.h"
															
 
																 #include "../../../helper.h"
															
 
																-#define NX 16
															
 
																+#define NX 4
															
 
																 #define NY NX
															
 
																 #define NZ NX
															
 
																 #define NT NX
															
--- a/tests/datawizard/interfaces/tensor/tensor_opencl.c
+++ b/tests/datawizard/interfaces/tensor/tensor_opencl.c
@@ -87,12 +87,12 @@ test_tensor_opencl_func(void *buffers[], void *args)
 
																 	}
															
 
																 	{
															
 
																-                size_t global = 1;
															
 
																+		size_t global[3] = {nx, ny, nz*nt};
															
 
																 		err = clEnqueueNDRangeKernel(queue,
															
 
																 					     kernel,
															
 
																-					     1,
															
 
																+					     3,
															
 
																 					     NULL,
															
 
																-					     &global,
															
 
																+					     global,
															
 
																 					     NULL,
															
 
																 					     0,
															
 
																 					     NULL,
															
--- a/tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl
+++ b/tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl
@@ -18,32 +18,24 @@ __kernel void tensor_opencl(__global int *tensor,
 
																 			   int ldy, int ldz, int ldt,
															
 
																 			   int factor, __global int *err)
															
 
																 {
															
 
																-        const int id = get_global_id(0);
															
 
																-	if (id > 0)
															
 
																+	const int idx = get_global_id(0);
															
 
																+	const int idy = get_global_id(1);
															
 
																+	const int idz = get_global_id(2) % nz;
															
 
																+	const int idt = get_global_id(2) / nz;
															
 
																+	if (idx >= nx)
															
 
																 		return;
															
 
																+	if (idy >= ny)
															
 
																+		return;
															
 
																+	if (idz >= nz)
															
 
																+		return;
															
 
																+	if (idt >= nt)
															
 
																+		return;
															
 
																+
															
 
																+	int val = idt*nz*ny*nx+idz*ny*nx+idy*nx+idx;
															
 
																+	int i = (idt*ldt)+(idz*ldz)+(idy*ldy)+idx;
															
 
																-	unsigned int i, j, k, l;
															
 
																-	int val = 0;
															
 
																-	for (l = 0; l < nt; l++)
															
 
																-	{
															
 
																-	    for (k = 0; k < nz; k++)
															
 
																-	    {
															
 
																-		for (j = 0; j < ny; j++)
															
 
																-		{
															
 
																-			for (i = 0; i < nx; i++)
															
 
																-			{
															
 
																-                                if (tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] != factor * val)
															
 
																-				{
															
 
																-					*err = 1;
															
 
																-					return;
															
 
																-				}
															
 
																-				else
															
 
																-				{
															
 
																-					tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] *= -1;
															
 
																-					val++;
															
 
																-				}
															
 
																-			}
															
 
																-		}
															
 
																-	    }
															
 
																-	}
															
 
																+	if (tensor[i] != factor * val)
															
 
																+		*err = 1;
															
 
																+	else
															
 
																+		tensor[i] *= -1;
															
 
																 }
															
--- a/tests/energy/energy_efficiency.c
+++ b/tests/energy/energy_efficiency.c
@@ -320,11 +320,19 @@ int main(int argc, char *argv[])
 
																 	unsigned N, k, m, n, iter, NITER;
															
 
																 	if (argc < 2)
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																+		N = 10;
															
 
																+#else
															
 
																 		N = 40;
															
 
																+#endif
															
 
																 	else
															
 
																 		N = atoi(argv[1]);
															
 
																 	if (argc < 3)
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																+		NITER = 3;
															
 
																+#else
															
 
																 		NITER = 10;
															
 
																+#endif
															
 
																 	else
															
 
																 		NITER = atoi(argv[2]);
															
 
																 	if (N == 0)
															
--- a/tests/helper/starpu_data_dup_ro.c
+++ b/tests/helper/starpu_data_dup_ro.c
@@ -84,7 +84,7 @@ int main(int argc, char **argv)
 
																 	ret = EXIT_SUCCESS;
															
 
																 	if (*var != 42)
															
 
																 	{
															
 
																-	     FPRINTF(stderr, "var2 is %d but it should be %d\n", *var, 42);
															
 
																+	     FPRINTF(stderr, "var2 is %u but it should be %d\n", *var, 42);
															
 
																 	     ret = EXIT_FAILURE;
															
 
																 	}
															
 
																 	starpu_data_release(var2_handle);
															
@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 
																 	var = starpu_data_get_local_ptr(var3_handle);
															
 
																 	if (*var != 42)
															
 
																 	{
															
 
																-	     FPRINTF(stderr, "var3 is %d but it should be %d\n", *var, 42);
															
 
																+	     FPRINTF(stderr, "var3 is %u but it should be %d\n", *var, 42);
															
 
																 	     ret = EXIT_FAILURE;
															
 
																 	}
															
 
																 	starpu_data_release(var3_handle);
															
@@ -102,7 +102,7 @@ int main(int argc, char **argv)
 
																 	var = starpu_data_get_local_ptr(var4_handle);
															
 
																 	if (*var != 42)
															
 
																 	{
															
 
																-	     FPRINTF(stderr, "var4 is %d but it should be %d\n", *var, 42);
															
 
																+	     FPRINTF(stderr, "var4 is %u but it should be %d\n", *var, 42);
															
 
																 	     ret = EXIT_FAILURE;
															
 
																 	}
															
 
																 	starpu_data_release(var4_handle);
															
@@ -111,7 +111,7 @@ int main(int argc, char **argv)
 
																 	var = starpu_data_get_local_ptr(var5_handle);
															
 
																 	if (*var != 43)
															
 
																 	{
															
 
																-	     FPRINTF(stderr, "var5 is %d but it should be %d\n", *var, 43);
															
 
																+	     FPRINTF(stderr, "var5 is %u but it should be %d\n", *var, 43);
															
 
																 	     ret = EXIT_FAILURE;
															
 
																 	}
															
 
																 	starpu_data_release(var5_handle);
															
--- a/tests/microbenchs/bandwidth.c
+++ b/tests/microbenchs/bandwidth.c
@@ -0,0 +1,343 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <stdio.h>
															
 
																+#include <unistd.h>
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include "../helper.h"
															
 
																+
															
 
																+/*
															
 
																+ * Measure the memory bandwidth available to kernels depending on the number of
															
 
																+ * kernels and number of idle workers.
															
 
																+ */
															
 
																+
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																+static size_t size = 1024;
															
 
																+static unsigned cpustep = 4;
															
 
																+#else
															
 
																+/* Must be bigger than available cache size per core, 64MiB should be enough */
															
 
																+static size_t size = 64UL << 20;
															
 
																+static unsigned cpustep = 1;
															
 
																+#endif
															
 
																+
															
 
																+static unsigned noalone = 0;
															
 
																+static unsigned iter = 30;
															
 
																+static unsigned total_ncpus;
															
 
																+static starpu_pthread_barrier_t barrier_begin, barrier_end;
															
 
																+static float *result;
															
 
																+static void **buffers;	/* Indexed by logical core number */
															
 
																+static char padding1[STARPU_CACHELINE_SIZE];
															
 
																+static volatile char finished;
															
 
																+static char padding2[STARPU_CACHELINE_SIZE];
															
 
																+
															
 
																+static unsigned interleave(unsigned i);
															
 
																+
															
 
																+/* Initialize the buffer locally */
															
 
																+void initialize_buffer(void *foo)
															
 
																+{
															
 
																+	unsigned id = starpu_worker_get_id();
															
 
																+#ifdef STARPU_HAVE_POSIX_MEMALIGN
															
 
																+	int ret = posix_memalign(&buffers[id], getpagesize(), 2*size);
															
 
																+	STARPU_ASSERT(ret == 0);
															
 
																+#else
															
 
																+	buffers[id] = malloc(2*size);
															
 
																+#endif
															
 
																+	memset(buffers[id], 0, 2*size);
															
 
																+}
															
 
																+
															
 
																+/* Actual transfer codelet */
															
 
																+void bw_func(void *descr[], void *arg)
															
 
																+{
															
 
																+	int id = (uintptr_t) arg;
															
 
																+	void *src = buffers[id];
															
 
																+	void *dst = (void*) ((uintptr_t)src + size);
															
 
																+	unsigned i;
															
 
																+	double start, stop;
															
 
																+
															
 
																+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
															
 
																+	start = starpu_timing_now();
															
 
																+	for (i = 0; i < iter; i++)
															
 
																+	{
															
 
																+		memcpy(dst, src, size);
															
 
																+		STARPU_SYNCHRONIZE();
															
 
																+	}
															
 
																+	stop = starpu_timing_now();
															
 
																+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_end);
															
 
																+	finished = 1;
															
 
																+
															
 
																+	result[id] = (size*iter) / (stop - start);
															
 
																+}
															
 
																+
															
 
																+static struct starpu_codelet bw_codelet =
															
 
																+{
															
 
																+	.cpu_funcs = {bw_func},
															
 
																+	.model = NULL,
															
 
																+	.nbuffers = 0,
															
 
																+};
															
 
																+
															
 
																+/* Codelet that waits for completion while doing lots of cpu yields (nop). */
															
 
																+void nop_func(void *descr[], void *arg)
															
 
																+{
															
 
																+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
															
 
																+	while (!finished)
															
 
																+	{
															
 
																+		unsigned i;
															
 
																+		for (i = 0; i < 1000000; i++)
															
 
																+			STARPU_UYIELD();
															
 
																+		STARPU_SYNCHRONIZE();
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static struct starpu_codelet nop_codelet =
															
 
																+{
															
 
																+	.cpu_funcs = {nop_func},
															
 
																+	.model = NULL,
															
 
																+	.nbuffers = 0,
															
 
																+};
															
 
																+
															
 
																+/* Codelet that waits for completion while aggressively reading the finished variable. */
															
 
																+void sync_func(void *descr[], void *arg)
															
 
																+{
															
 
																+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
															
 
																+	while (!finished)
															
 
																+	{
															
 
																+		STARPU_VALGRIND_YIELD();
															
 
																+		STARPU_SYNCHRONIZE();
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static struct starpu_codelet sync_codelet =
															
 
																+{
															
 
																+	.cpu_funcs = {sync_func},
															
 
																+	.model = NULL,
															
 
																+	.nbuffers = 0,
															
 
																+};
															
 
																+
															
 
																+static void usage(char **argv)
															
 
																+{
															
 
																+	fprintf(stderr, "Usage: %s [-n niter] [-s size (MB)] [-c cpustep] [-a]\n", argv[0]);
															
 
																+	fprintf(stderr, "\t-n niter\tNumber of iterations\n");
															
 
																+	fprintf(stderr, "\t-s size\tBuffer size in MB\n");
															
 
																+	fprintf(stderr, "\t-c cpustep\tCpu number increment\n");
															
 
																+	fprintf(stderr, "\t-a Do not run the alone test\n");
															
 
																+	exit(EXIT_FAILURE);
															
 
																+}
															
 
																+
															
 
																+static void parse_args(int argc, char **argv)
															
 
																+{
															
 
																+	int c;
															
 
																+	while ((c = getopt(argc, argv, "n:s:c:ah")) != -1)
															
 
																+	switch(c)
															
 
																+	{
															
 
																+		case 'n':
															
 
																+			iter = atoi(optarg);
															
 
																+			break;
															
 
																+		case 's':
															
 
																+			size = (long)atoi(optarg) << 20;
															
 
																+			break;
															
 
																+		case 'c':
															
 
																+			cpustep = atoi(optarg);
															
 
																+			break;
															
 
																+		case 'a':
															
 
																+			noalone = 1;
															
 
																+			break;
															
 
																+		case 'h':
															
 
																+			usage(argv);
															
 
																+			break;
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static unsigned interleave(unsigned i)
															
 
																+{
															
 
																+	/* TODO: rather distribute over hierarchy */
															
 
																+	if (total_ncpus > 1)
															
 
																+		return (i % (total_ncpus/2))*2 + i / (total_ncpus/2);
															
 
																+	else
															
 
																+		return 0;
															
 
																+}
															
 
																+
															
 
																+enum sleep_type {
															
 
																+	PAUSE,
															
 
																+	NOP,
															
 
																+	SYNC,
															
 
																+	SCHED,
															
 
																+};
															
 
																+
															
 
																+static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int intl, enum sleep_type sleep)
															
 
																+{
															
 
																+	int ret;
															
 
																+	unsigned i;
															
 
																+	struct starpu_conf conf;
															
 
																+	float bw;
															
 
																+
															
 
																+	starpu_conf_init(&conf);
															
 
																+	conf.precedence_over_environment_variables = 1;
															
 
																+	conf.ncuda = 0;
															
 
																+	conf.nopencl = 0;
															
 
																+	conf.nmic = 0;
															
 
																+	conf.nmpi_ms = 0;
															
 
																+	conf.ncpus = ncpus;
															
 
																+
															
 
																+	if (intl && sleep == PAUSE)
															
 
																+	{
															
 
																+		conf.use_explicit_workers_bindid = 1;
															
 
																+		for (i = 0; i < ncpus; i++)
															
 
																+			conf.workers_bindid[i] = interleave(i);
															
 
																+	}
															
 
																+
															
 
																+	ret = starpu_initialize(&conf, argc, argv);
															
 
																+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																+
															
 
																+	if (sleep == PAUSE || sleep == SCHED)
															
 
																+		/* In these cases we don't have a task on each cpu */
															
 
																+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, nbusy);
															
 
																+	else
															
 
																+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, ncpus);
															
 
																+
															
 
																+	STARPU_PTHREAD_BARRIER_INIT(&barrier_end, NULL, nbusy);
															
 
																+
															
 
																+	finished = 0;
															
 
																+	for (i = 0; i < ncpus; i++)
															
 
																+		result[i] = NAN;
															
 
																+
															
 
																+	for (i = 0; i < nbusy; i++)
															
 
																+	{
															
 
																+		struct starpu_task *task = starpu_task_create();
															
 
																+		task->cl = &bw_codelet;
															
 
																+
															
 
																+		if (intl)
															
 
																+			task->cl_arg = (void*) (uintptr_t) interleave(i);
															
 
																+		else
															
 
																+			task->cl_arg = (void*) (uintptr_t) i;
															
 
																+
															
 
																+		task->execute_on_a_specific_worker = 1;
															
 
																+		if (intl && sleep != PAUSE) /* In the pause case we interleaved above */
															
 
																+			task->workerid = interleave(i);
															
 
																+		else
															
 
																+			task->workerid = i;
															
 
																+
															
 
																+		ret = starpu_task_submit(task);
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																+	}
															
 
																+
															
 
																+	if (sleep != PAUSE && sleep != SCHED)
															
 
																+	{
															
 
																+		/* Add waiting tasks */
															
 
																+		for ( ; i < ncpus; i++)
															
 
																+		{
															
 
																+			struct starpu_task *task = starpu_task_create();
															
 
																+			switch (sleep)
															
 
																+			{
															
 
																+			case NOP:
															
 
																+				task->cl = &nop_codelet;
															
 
																+				break;
															
 
																+			case SYNC:
															
 
																+				task->cl = &sync_codelet;
															
 
																+				break;
															
 
																+			default:
															
 
																+				STARPU_ASSERT(0);
															
 
																+			}
															
 
																+			task->execute_on_a_specific_worker = 1;
															
 
																+			task->workerid = interleave(i);
															
 
																+			ret = starpu_task_submit(task);
															
 
																+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+
															
 
																+	starpu_task_wait_for_all();
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+	for (bw = 0., i = 0; i < nbusy; i++)
															
 
																+	{
															
 
																+		if (intl)
															
 
																+			bw += result[interleave(i)];
															
 
																+		else
															
 
																+			bw += result[i];
															
 
																+	}
															
 
																+	return bw;
															
 
																+}
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	int ret;
															
 
																+	unsigned n;
															
 
																+	struct starpu_conf conf;
															
 
																+	float alone, alone_int, alone_int_nop, alone_int_sync, sched, sched_int;
															
 
																+
															
 
																+	parse_args(argc, argv);
															
 
																+
															
 
																+	starpu_conf_init(&conf);
															
 
																+	conf.precedence_over_environment_variables = 1;
															
 
																+	conf.ncuda = 0;
															
 
																+	conf.nopencl = 0;
															
 
																+	conf.nmic = 0;
															
 
																+	conf.nmpi_ms = 0;
															
 
																+
															
 
																+	ret = starpu_initialize(&conf, &argc, &argv);
															
 
																+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																+	total_ncpus = starpu_cpu_worker_get_count();
															
 
																+
															
 
																+	buffers = malloc(total_ncpus * sizeof(*buffers));
															
 
																+	starpu_execute_on_each_worker_ex(initialize_buffer, NULL, STARPU_CPU, "init_buffer");
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+	if (total_ncpus == 0)
															
 
																+		return STARPU_TEST_SKIPPED;
															
 
																+
															
 
																+	result = malloc(total_ncpus * sizeof(result[0]));
															
 
																+
															
 
																+	printf("# nw\ta comp.\t+sched\teff%%\ta scat.\t+nop\t+sync\t+sched\teff%% vs nop\n");
															
 
																+	for (n = cpustep; n <= total_ncpus; n += cpustep)
															
 
																+	{
															
 
																+		if (noalone)
															
 
																+		{
															
 
																+			alone = 0.;
															
 
																+			alone_int = 0.;
															
 
																+			alone_int_nop = 0.;
															
 
																+			alone_int_sync = 0.;
															
 
																+		}
															
 
																+		else
															
 
																+		{
															
 
																+			alone = bench(&argc, &argv, n, n, 0, PAUSE);
															
 
																+			alone_int = bench(&argc, &argv, n, n, 1, PAUSE);
															
 
																+			alone_int_nop = bench(&argc, &argv, n, total_ncpus, 1, NOP);
															
 
																+			alone_int_sync = bench(&argc, &argv, n, total_ncpus, 1, SYNC);
															
 
																+		}
															
 
																+		sched = bench(&argc, &argv, n, total_ncpus, 0, SCHED);
															
 
																+		sched_int = bench(&argc, &argv, n, total_ncpus, 1, SCHED);
															
 
																+		printf("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
															
 
																+				n,
															
 
																+				alone/1000,
															
 
																+				sched/1000, sched*100/alone,
															
 
																+				alone_int/1000,
															
 
																+				alone_int_nop/1000,
															
 
																+				alone_int_sync/1000,
															
 
																+				sched_int/1000, sched_int*100/alone_int_nop);
															
 
																+		fflush(stdout);
															
 
																+	}
															
 
																+
															
 
																+	free(result);
															
 
																+
															
 
																+	for (n = 0; n < total_ncpus; n++)
															
 
																+		free(buffers[n]);
															
 
																+	free(buffers);
															
 
																+
															
 
																+	return EXIT_SUCCESS;
															
 
																+}
															
--- a/tests/microbenchs/bandwidth_scheds.sh
+++ b/tests/microbenchs/bandwidth_scheds.sh
@@ -0,0 +1,75 @@
 
																+#!/bin/bash
															
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2016-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+#
															
 
																+
															
 
																+set -e
															
 
																+
															
 
																+if [ -n "$STARPU_SCHED" ]
															
 
																+then
															
 
																+	SCHEDS=$STARPU_SCHED
															
 
																+	DEFAULT=$STARPU_SCHED
															
 
																+else
															
 
																+	SCHEDS=`$(dirname $0)/../../tools/starpu_sched_display`
															
 
																+	DEFAULT=eager
															
 
																+fi
															
 
																+
															
 
																+if [ -n "$STARPU_BENCH_DIR" ]; then
															
 
																+	cat > bandwidth.gp << EOF
															
 
																+set term svg font ",12" size 1500,500 linewidth 0.5
															
 
																+set output "bandwidth.svg"
															
 
																+set pointsize 0.3
															
 
																+EOF
															
 
																+else
															
 
																+	fast="-n 3 -c 4"
															
 
																+	cat > bandwidth.gp << EOF
															
 
																+set term postscript eps enhanced color font ",18"
															
 
																+set output "bandwidth.eps"
															
 
																+set size 2,1
															
 
																+EOF
															
 
																+fi
															
 
																+
															
 
																+cat >> bandwidth.gp << EOF
															
 
																+set key outside
															
 
																+set ylabel "GB/s"
															
 
																+set xlabel "ncores"
															
 
																+
															
 
																+plot \\
															
 
																+	"bandwidth-$DEFAULT.dat" using 1:5 with lines title "alone interleave", \\
															
 
																+	"bandwidth-$DEFAULT.dat" using 1:6 with lines title "nop", \\
															
 
																+	"bandwidth-$DEFAULT.dat" using 1:7 with lines title "sync", \\
															
 
																+	"bandwidth-$DEFAULT.dat" using 1:2 with lines title "alone contiguous", \\
															
 
																+EOF
															
 
																+
															
 
																+type=1
															
 
																+for sched in $SCHEDS
															
 
																+do
															
 
																+	if [ "$sched" != eager -a "$sched" != "$SCHEDS" ]; then
															
 
																+		extra=-a
															
 
																+	else
															
 
																+		extra=
															
 
																+	fi
															
 
																+
															
 
																+	STARPU_BACKOFF_MIN=0 STARPU_BACKOFF_MAX=0 STARPU_SCHED=$sched $STARPU_LAUNCH $(dirname $0)/bandwidth $fast $extra "$@" | tee bandwidth-$sched.dat
															
 
																+	echo "\"bandwidth-$sched.dat\" using 1:3 with linespoints lt $type pt $type title \"$sched\", \\" >> bandwidth.gp
															
 
																+	echo "\"bandwidth-$sched.dat\" using 1:8 with linespoints lt $type pt $type notitle, \\" >> bandwidth.gp
															
 
																+	type=$((type+1))
															
 
																+done
															
 
																+
															
 
																+if gnuplot bandwidth.gp ; then
															
 
																+	if [ -n "$STARPU_BENCH_DIR" ]; then
															
 
																+		cp bandwidth.svg $STARPU_BENCH_DIR/
															
 
																+	fi
															
 
																+fi
															
--- a/tests/overlap/overlap.c
+++ b/tests/overlap/overlap.c
@@ -135,6 +135,7 @@ int main(int argc, char **argv)
 
																 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																 	}
															
 
																+	starpu_do_schedule();
															
 
																 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
															
 
																 	if (!finished)
															
 
																 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
															
--- a/tests/parallel_tasks/explicit_combined_worker.c
+++ b/tests/parallel_tasks/explicit_combined_worker.c
@@ -114,7 +114,7 @@ int main(void)
 
																 enodev:
															
 
																 	starpu_data_unregister(v_handle);
															
 
																 	starpu_free(v);
															
 
																-	fprintf(stderr, "WARNING: No one can execute the task on workerid %d\n", worker);
															
 
																+	fprintf(stderr, "WARNING: No one can execute the task on workerid %u\n", worker);
															
 
																 	/* yes, we do not perform the computation but we did detect that no one
															
 
																  	 * could perform the kernel, so this is not an error from StarPU */
															
 
																 	starpu_shutdown();
															
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -285,6 +285,7 @@ EXTRA_DIST =				\
 
																 	dev/cppcheck/suppressions.txt	\
															
 
																 	dev/valgrind/bash.suppr		\
															
 
																 	dev/valgrind/fxt.suppr		\
															
 
																+	dev/valgrind/glpk.suppr		\
															
 
																 	dev/valgrind/hdf5.suppr		\
															
 
																 	dev/valgrind/hwloc.suppr	\
															
 
																 	dev/valgrind/libc.suppr		\
															
--- a/tools/dev/valgrind/glpk.suppr
+++ b/tools/dev/valgrind/glpk.suppr
@@ -0,0 +1,23 @@
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+#
															
 
																+
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   ...
															
 
																+   fun:glp_init_env
															
 
																+}
															
--- a/tools/dev/valgrind/libc.suppr
+++ b/tools/dev/valgrind/libc.suppr
@@ -263,8 +263,7 @@
 
																    Memcheck:Leak
															
 
																    match-leak-kinds: reachable
															
 
																    fun:malloc
															
 
																-   fun:_dl_close_worker
															
 
																-   fun:_dl_close_worker
															
 
																+   ...
															
 
																    fun:_dl_close
															
 
																    fun:_dl_catch_exception
															
 
																    fun:_dl_catch_error
															
--- a/tools/dev/valgrind/padico.suppr
+++ b/tools/dev/valgrind/padico.suppr
@@ -110,7 +110,7 @@
 
																    Memcheck:Leak
															
 
																    match-leak-kinds: reachable
															
 
																    fun:malloc
															
 
																-   fun:_dl_close_worker
															
 
																+   ...
															
 
																    fun:_dl_close
															
 
																    fun:_dl_catch_error
															
 
																    fun:dlerror_run
															
--- a/tools/gdbinit
+++ b/tools/gdbinit
@@ -39,33 +39,6 @@ define starpu-print-task
 
																   set $task = (struct starpu_task *)$arg0
															
 
																   set $job = (struct _starpu_job *)$task->starpu_private
															
 
																   set $status=0
															
 
																-  if $task->status == 0
															
 
																-    set $status="STARPU_TASK_INIT"
															
 
																-  end
															
 
																-  if $task->status == 1
															
 
																-    set $status="STARPU_TASK_BLOCKED"
															
 
																-  end
															
 
																-  if $task->status == 2
															
 
																-    set $status="STARPU_TASK_READY"
															
 
																-  end
															
 
																-  if $task->status == 3
															
 
																-    set $status="STARPU_TASK_RUNNING"
															
 
																-  end
															
 
																-  if $task->status == 4
															
 
																-    set $status="STARPU_TASK_FINISHED"
															
 
																-  end
															
 
																-  if $task->status == 5
															
 
																-    set $status="STARPU_TASK_BLOCKED_ON_TAG"
															
 
																-  end
															
 
																-  if $task->status == 6
															
 
																-    set $status="STARPU_TASK_BLOCKED_ON_TASK"
															
 
																-  end
															
 
																-  if $task->status == 7
															
 
																-    set $status="STARPU_TASK_BLOCKED_ON_DATA"
															
 
																-  end
															
 
																-  if $task->status == 8
															
 
																-    set $status="STARPU_TASK_STOPPED"
															
 
																-  end
															
 
																   printf "StarPU Task (%p)\n", $task
															
 
																   if $task->name
															
@@ -81,13 +54,42 @@ define starpu-print-task
 
																   end
															
 
																   printf "\tnbuffers:\t\t\t<%d>\n", $nbuffers
															
 
																   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
															
 
																+  printf "\tcl_arg:\t\t\t<%p>\n", $task->cl_arg
															
 
																   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
															
 
																   printf "\texecute_on_a_specific_worker:\t<%d>\n", $task->execute_on_a_specific_worker
															
 
																   printf "\tworkerid:\t\t\t<%d>\n", $task->workerid
															
 
																   printf "\tdetach:\t\t\t\t<%d>\n", $task->detach
															
 
																   printf "\tdestroy:\t\t\t<%d>\n", $task->destroy
															
 
																   printf "\tregenerate:\t\t\t<%d>\n", $task->regenerate
															
 
																-  printf "\tstatus:\t\t\t\t<%s>\n", $status
															
 
																+  printf "\tstatus:\t\t\t\t"
															
 
																+  if $task->status == 0
															
 
																+    printf "STARPU_TASK_INIT"
															
 
																+  end
															
 
																+  if $task->status == 1
															
 
																+    printf "STARPU_TASK_BLOCKED"
															
 
																+  end
															
 
																+  if $task->status == 2
															
 
																+    printf "STARPU_TASK_READY"
															
 
																+  end
															
 
																+  if $task->status == 3
															
 
																+    printf "STARPU_TASK_RUNNING"
															
 
																+  end
															
 
																+  if $task->status == 4
															
 
																+    printf "STARPU_TASK_FINISHED"
															
 
																+  end
															
 
																+  if $task->status == 5
															
 
																+    printf "STARPU_TASK_BLOCKED_ON_TAG"
															
 
																+  end
															
 
																+  if $task->status == 6
															
 
																+    printf "STARPU_TASK_BLOCKED_ON_TASK"
															
 
																+  end
															
 
																+  if $task->status == 7
															
 
																+    printf "STARPU_TASK_BLOCKED_ON_DATA"
															
 
																+  end
															
 
																+  if $task->status == 8
															
 
																+    printf "STARPU_TASK_STOPPED"
															
 
																+  end
															
 
																+  printf "\n"
															
 
																   printf "\tjob:\t\t\t\t<%p>\n", $job
															
 
																   printf "\ttag_id:\t\t\t\t<%d>\n", $task->tag_id
															
 
																   printf "\tndeps:\t\t\t\t<%u>\n", $job->job_successors->ndeps
															
@@ -169,35 +171,36 @@ define starpu-workers
 
																   printf "[Id] Name                                     Arch Mask Devid Bindid Workerid Isrunning Isinitialized Status\n"
															
 
																   while $num<_starpu_config->topology->nworkers
															
 
																     set $worker=&_starpu_config->workers[$num]
															
 
																+    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d ", $num, $worker->name, $worker->arch, $worker->worker_mask, \
															
 
																+          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized
															
 
																     if $worker->status == STATUS_INVALID
															
 
																-      set $status="INVALID"
															
 
																+      printf "INVALID"
															
 
																     end
															
 
																     if $worker->status == STATUS_UNKNOWN
															
 
																-      set $status="UNKNOWN"
															
 
																+      printf "UNKNOWN"
															
 
																     end
															
 
																     if $worker->status == STATUS_INITIALIZING
															
 
																-      set $status="INITIALIZING"
															
 
																+      printf "INITIALIZING"
															
 
																     end
															
 
																     if $worker->status == STATUS_EXECUTING
															
 
																-      set $status="EXECUTING"
															
 
																+      printf "EXECUTING"
															
 
																     end
															
 
																     if $worker->status == STATUS_CALLBACK
															
 
																-      set $status="CALLBACK"
															
 
																+      printf "CALLBACK"
															
 
																     end
															
 
																     if $worker->status == STATUS_SCHEDULING
															
 
																-      set $status="SCHEDULING"
															
 
																+      printf "SCHEDULING"
															
 
																     end
															
 
																     if $worker->status == STATUS_WAITING
															
 
																-      set $status="WAITING"
															
 
																+      printf "WAITING"
															
 
																     end
															
 
																     if $worker->status == STATUS_SLEEPING_SCHEDULING
															
 
																-      set $status="SLEEPING_SCHEDULING"
															
 
																+      printf "SLEEPING_SCHEDULING"
															
 
																     end
															
 
																     if $worker->status == STATUS_SLEEPING
															
 
																-      set $status="SLEEPING"
															
 
																+      printf "SLEEPING"
															
 
																     end
															
 
																-    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d %s\n", $num, $worker->name, $worker->arch, $worker->worker_mask, \
															
 
																-          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized, $status
															
 
																+    printf "\n"
															
 
																     set $num = $num + 1
															
 
																   end
															
 
																 end
															
@@ -205,23 +208,24 @@ end
 
																 define starpu-print-tag
															
 
																   set language c
															
 
																   set $tag_struct = (struct _starpu_tag *)_gettag_struct($arg0)
															
 
																+  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
															
 
																+  printf "\tstate "
															
 
																   if $tag_struct->state == STARPU_INVALID_STATE
															
 
																-     set $status="STARPU_INVALID_STATE"
															
 
																+     printf "STARPU_INVALID_STATE"
															
 
																   end
															
 
																   if $tag_struct->state == STARPU_ASSOCIATED
															
 
																-     set $status="STARPU_ASSOCIATED"
															
 
																+     printf "STARPU_ASSOCIATED"
															
 
																   end
															
 
																   if $tag_struct->state == STARPU_BLOCKED
															
 
																-     set $status="STARPU_BLOCKED"
															
 
																+     printf "STARPU_BLOCKED"
															
 
																   end
															
 
																   if $tag_struct->state == STARPU_READY
															
 
																-     set $status="STARPU_READY"
															
 
																+     printf "STARPU_READY"
															
 
																   end
															
 
																   if $tag_struct->state == STARPU_DONE
															
 
																-     set $status="STARPU_DONE"
															
 
																+     printf "STARPU_DONE"
															
 
																   end
															
 
																-  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
															
 
																-  printf "\tstate %s\n", $status
															
 
																+  printf "\n"
															
 
																   printf "\tdeps %d\n", $tag_struct->tag_successors.ndeps
															
 
																   printf "\tcompleted %d\n", $tag_struct->tag_successors.ndeps_completed
															
 
																   printf "\tndeps_remaining:\t\t<%u>\n", $tag_struct->tag_successors->ndeps - $tag_struct->tag_successors->ndeps_completed
															
@@ -317,7 +321,11 @@ define starpu-all-tasks
 
																     while $l != &all_jobs_list
															
 
																       set $j = (struct _starpu_job*) (((unsigned long) $l) - ((unsigned long) &((struct _starpu_job *)0)->all_submitted))
															
 
																       set $task = $j->task
															
 
																-      printf "task %p %s\n", $task, $task->name ? $task->name : ""
															
 
																+      if $task->name
															
 
																+        printf "task %p %s\n", $task, $task->name
															
 
																+      else
															
 
																+        printf "task %p\n", $task
															
 
																+      end
															
 
																       set $l = $l->next
															
 
																     end
															
 
																   end
															
@@ -915,9 +923,9 @@ end
 
																 define starpu-sched-print-component
															
 
																     set $c = (struct starpu_sched_component *) $arg1
															
 
																     starpu-print-spaces $arg0
															
 
																-    printf "%s %s %s (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? "homogeneous":"heterogeneous", $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? "single-node" : "multi-node", $c
															
 
																+    printf "%s %c %c (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? 'o':'e', $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? 's' : 'm', $c
															
 
																     if $c->push_task == fifo_push_task
															
 
																-      set $f = ((struct _starpu_fifo_data *) $c->data)->fifo
															
 
																+      set $f = &((struct _starpu_fifo_data *) $c->data)->fifo
															
 
																       starpu-print-spaces $arg0
															
 
																       printf "%d tasks start %f len %f end %f processed %d\n", $f->ntasks, $f->exp_start, $f->exp_len, $f->exp_end, $f->nprocessed
															
 
																     end
															
@@ -951,29 +959,29 @@ end
 
																 define starpu-mpi-print-request
															
 
																     set $request = (struct _starpu_mpi_req *)$arg0
															
 
																-    set $request_type = "unknown_type"
															
 
																+    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type ", $request, $request->data_handle, $request->data_handle && $request->data_handle->mpi_data ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.node.rank,
															
 
																     if $request->request_type == SEND_REQ
															
 
																-       set $request_type = "SEND_REQ"
															
 
																+       printf "SEND_REQ"
															
 
																     end
															
 
																     if $request->request_type == RECV_REQ
															
 
																-       set $request_type = "RECV_REQ"
															
 
																+       printf "RECV_REQ"
															
 
																     end
															
 
																     if $request->request_type == WAIT_REQ
															
 
																-       set $request_type = "WAIT_REQ"
															
 
																+       printf "WAIT_REQ"
															
 
																     end
															
 
																     if $request->request_type == TEST_REQ
															
 
																-       set $request_type = "TEST_REQ"
															
 
																+       printf "TEST_REQ"
															
 
																     end
															
 
																     if $request->request_type == BARRIER_REQ
															
 
																-       set $request_type = "BARRIER_REQ"
															
 
																+       printf "BARRIER_REQ"
															
 
																     end
															
 
																     if $request->request_type == PROBE_REQ
															
 
																-       set $request_type = "PROBE_REQ"
															
 
																+       printf "PROBE_REQ"
															
 
																     end
															
 
																     if $request->request_type == UNKNOWN_REQ
															
 
																-       set $request_type = "UNKNOWN_REQ"
															
 
																+       printf "UNKNOWN_REQ"
															
 
																     end
															
 
																-    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type %s submitted %d completed %d posted %d detached %d is_internal_req %d\n", $request, $request->data_handle, $request->data_handle ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.rank, $request_type, $request->submitted, $request->completed, $request->posted, $request->detached, $request->is_internal_req
															
 
																+    printf " submitted %d completed %d posted %d detached %d\n", $request->submitted, $request->completed, $request->posted, $request->detached
															
 
																 end
															
 
																 define starpu-mpi-print-ready-recv-requests
															
@@ -989,17 +997,39 @@ define starpu-mpi-print-ready-recv-requests
 
																     end
															
 
																 end
															
 
																+define starpu-mpi-print-requests-list
															
 
																+  set $list = $arg0
															
 
																+  set $request = $list->_head
															
 
																+  while $request
															
 
																+    starpu-mpi-print-request $request
															
 
																+    set $request = $request->_next
															
 
																+  end
															
 
																+end
															
 
																+
															
 
																+define starpu-mpi-print-requests-tree
															
 
																+  if $arg0
															
 
																+    starpu-mpi-print-requests-tree $arg0->children[0]
															
 
																+    set $stage = (struct _starpu_mpi_req_prio_list_stage *) $arg0
															
 
																+    starpu-mpi-print-requests-list (&($stage->list))
															
 
																+    starpu-mpi-print-requests-tree $arg0->children[1]
															
 
																+  end
															
 
																+end
															
 
																+
															
 
																 define starpu-mpi-print-ready-send-requests
															
 
																-    set $list = (struct _starpu_mpi_req_prio_list) ready_send_requests
															
 
																-    if $list
															
 
																-	set $request = $list.list._head
															
 
																-        while $request
															
 
																-            starpu-mpi-print-request $request
															
 
																-	    set $request = $request->_next
															
 
																-	end
															
 
																+  set $prio_list = (struct _starpu_mpi_req_prio_list) ready_send_requests
															
 
																+  if _starpu_debug
															
 
																+    if $prio_list
															
 
																+        starpu-mpi-print-requests-list &$prio_list.list
															
 
																+    else
															
 
																+	printf "No ready send requests\n"
															
 
																+    end
															
 
																+  else
															
 
																+    if $prio_list.empty == 0
															
 
																+        starpu-mpi-print-requests-tree $prio_list.tree.root
															
 
																     else
															
 
																 	printf "No ready send requests\n"
															
 
																     end
															
 
																+  end
															
 
																 end
															
 
																 define starpu-mpi-print-detached-requests
															
--- a/tools/starpu_fxt_tool.c
+++ b/tools/starpu_fxt_tool.c
@@ -81,7 +81,7 @@ static int parse_args(int argc, char **argv)
 
																 		{
															
 
																 			if (options.ninputfiles >= STARPU_FXT_MAX_FILES)
															
 
																 			{
															
 
																-				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%u)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
															
 
																+				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%d)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
															
 
																 				return 7;
															
 
																 			}
															
 
																 			options.filenames[options.ninputfiles++] = argv[++i];
															
@@ -179,7 +179,7 @@ static int parse_args(int argc, char **argv)
 
																 		{
															
 
																 			if (options.ninputfiles >= STARPU_FXT_MAX_FILES)
															
 
																 			{
															
 
																-				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%u)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
															
 
																+				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%d)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
															
 
																 				return 7;
															
 
																 			}
															
 
																 			options.filenames[options.ninputfiles++] = argv[i];
															
--- a/tools/starpu_perfmodel_recdump.c
+++ b/tools/starpu_perfmodel_recdump.c