瀏覽代碼

Merge branch 'master' of gitlab.inria.fr:starpu/starpu

HE Kun 4 年之前
父節點
當前提交
83bc792574
共有 100 個文件被更改,包括 2708 次插入883 次删除
  1. 10 1
      .gitlab-ci.yml
  2. 1 0
      AUTHORS
  3. 3 0
      ChangeLog
  4. 2 0
      Makefile.am
  5. 21 16
      configure.ac
  6. 18 3
      contrib/ci.inria.fr/job-1-check.sh
  7. 22 0
      contrib/gitlab/simgrid.sh
  8. 1 1
      doc/doxygen/chapters/101_building.doxy
  9. 2 1
      doc/doxygen/chapters/310_data_management.doxy
  10. 22 34
      doc/doxygen/chapters/320_scheduling.doxy
  11. 4 4
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  12. 31 0
      doc/doxygen/chapters/410_mpi_support.doxy
  13. 31 0
      doc/doxygen/chapters/501_environment_variables.doxy
  14. 1 1
      doc/doxygen/chapters/code/disk_copy.c
  15. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.eps
  16. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.pdf
  17. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.png
  18. 1 1
      examples/Makefile.am
  19. 6 2
      examples/basic_examples/multiformat_conversion_codelets.c
  20. 54 158
      examples/cg/cg.c
  21. 0 25
      examples/cg/cg.h
  22. 216 37
      examples/cg/cg_kernels.c
  23. 1 1
      examples/pi/pi_redux.c
  24. 1 1
      examples/reductions/dot_product.c
  25. 1 1
      examples/reductions/minmax_reduction.c
  26. 13 0
      include/fstarpu_mod.f90
  27. 8 0
      include/starpu.h
  28. 14 4
      include/starpu_data.h
  29. 8 0
      include/starpu_hash.h
  30. 3 5
      include/starpu_perfmodel.h
  31. 28 2
      include/starpu_task.h
  32. 4 0
      include/starpu_util.h
  33. 2 2
      julia/README
  34. 2 2
      julia/examples/execute.sh.in
  35. 2 2
      julia/setenv.sh
  36. 1 1
      julia/src/StarPU.jl
  37. 89 2
      mpi/examples/Makefile.am
  38. 422 0
      mpi/examples/cg/cg.c
  39. 201 0
      mpi/examples/mpi_redux/mpi_redux.c
  40. 253 0
      mpi/examples/native_fortran/nf_mpi_redux.f90
  41. 238 0
      mpi/examples/native_fortran/nf_redux_test.f90
  42. 9 0
      mpi/include/starpu_mpi.h
  43. 0 1
      mpi/src/mpi/starpu_mpi_early_data.h
  44. 40 34
      mpi/src/mpi/starpu_mpi_mpi.c
  45. 0 2
      mpi/src/mpi/starpu_mpi_mpi_backend.c
  46. 0 1
      mpi/src/mpi/starpu_mpi_mpi_backend.h
  47. 28 9
      mpi/src/starpu_mpi.c
  48. 1 2
      mpi/src/starpu_mpi_coop_sends.c
  49. 16 6
      mpi/src/starpu_mpi_private.h
  50. 55 20
      mpi/src/starpu_mpi_task_insert.c
  51. 1 1
      mpi/src/starpu_mpi_task_insert_fortran.c
  52. 1 1
      mpi/tests/mpi_reduction.c
  53. 3 0
      mpi/tests/mpi_redux.c
  54. 20 20
      src/common/fxt.h
  55. 5 0
      src/common/hash.c
  56. 4 4
      src/common/uthash.h
  57. 1 1
      src/core/dependencies/data_arbiter_concurrency.c
  58. 2 2
      src/core/dependencies/data_concurrency.c
  59. 6 2
      src/core/dependencies/implicit_data_deps.c
  60. 1 2
      src/core/jobs.c
  61. 18 11
      src/core/perfmodel/energy_model.c
  62. 8 4
      src/core/perfmodel/perfmodel_bus.c
  63. 4 3
      src/core/perfmodel/perfmodel_history.c
  64. 1 20
      src/core/sched_policy.c
  65. 0 2
      src/core/sched_policy.h
  66. 10 0
      src/core/workers.c
  67. 78 61
      src/datawizard/coherency.c
  68. 10 10
      src/datawizard/coherency.h
  69. 3 3
      src/datawizard/copy_driver.c
  70. 8 1
      src/datawizard/copy_driver.h
  71. 306 179
      src/datawizard/data_request.c
  72. 34 13
      src/datawizard/data_request.h
  73. 87 25
      src/datawizard/datawizard.c
  74. 8 7
      src/datawizard/datawizard.h
  75. 1 1
      src/datawizard/filters.c
  76. 5 2
      src/datawizard/interfaces/data_interface.c
  77. 10 0
      src/datawizard/malloc.c
  78. 7 0
      src/datawizard/malloc.h
  79. 21 11
      src/datawizard/memalloc.c
  80. 1 1
      src/datawizard/memalloc.h
  81. 3 4
      src/datawizard/memory_nodes.c
  82. 21 0
      src/datawizard/memory_nodes.h
  83. 11 2
      src/datawizard/reduction.c
  84. 11 8
      src/datawizard/user_interactions.c
  85. 2 2
      src/datawizard/write_back.c
  86. 2 2
      src/debug/latency.c
  87. 51 42
      src/debug/traces/starpu_fxt.c
  88. 3 2
      src/drivers/cpu/driver_cpu.c
  89. 4 36
      src/drivers/cuda/driver_cuda.c
  90. 3 3
      src/drivers/mp_common/source_common.c
  91. 4 4
      src/drivers/opencl/driver_opencl.c
  92. 3 0
      src/profiling/profiling.c
  93. 1 1
      src/sched_policies/component_best_implementation.c
  94. 3 1
      src/sched_policies/component_fifo.c
  95. 3 3
      src/sched_policies/component_worker.c
  96. 25 2
      src/sched_policies/fifo_queues.c
  97. 1 0
      src/sched_policies/fifo_queues.h
  98. 2 2
      src/sched_policies/prio_deque.c
  99. 5 0
      src/sched_policies/work_stealing_policy.c
  100. 0 0
      src/util/execute_on_all.c

+ 10 - 1
.gitlab-ci.yml

@@ -30,7 +30,7 @@ build:
       when: never  # Prevent pipeline run for push event
     - when: always # Run pipeline for all other cases
 
-deploy:
+check:
   stage: deploy
   script:
     - ./contrib/gitlab/deploy.sh
@@ -38,3 +38,12 @@ deploy:
     - if: '$CI_PIPELINE_SOURCE == "push"'
       when: never  # Prevent pipeline run for push event
     - when: always # Run pipeline for all other cases
+
+simgrid:
+  stage: deploy
+  script:
+    - ./contrib/gitlab/simgrid.sh
+  rules:
+    - if: '$CI_PIPELINE_SOURCE == "push"'
+      when: never  # Prevent pipeline run for push event
+    - when: always # Run pipeline for all other cases

+ 1 - 0
AUTHORS

@@ -17,6 +17,7 @@ Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
 He Kun, Inria, <kun.he@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>
+Jego Antoine, Enseeiht, <antoine.jego@etu.enseeiht.fr>
 Juhoor Mehdi, Université de Bordeaux, <mjuhoor@gmail.com>
 Juven Alexis, Inria, <alexis.juven@inria.fr>
 Keryell-Even Maël, Inria, <mael.keryell@inria.fr>

+ 3 - 0
ChangeLog

@@ -51,9 +51,11 @@ New features:
     starpu_mpi_interface_datatype_node_register which will be needed for
     MPI/NUMA/GPUDirect.
   * Add peek_data interface method.
+  * Add STARPU_MPI_REDUX
 
 Small changes:
   * Add a synthetic energy efficiency testcase.
+  * Make reduction methods want the commute flag.
 
 StarPU 1.3.8
 ====================================================================
@@ -67,6 +69,7 @@ Small features:
     STARPU_MPI_THREAD_COREID environment variables to bind threads to cores
     instead of hyperthreads.
   * New STARPU_TASK_PROGRESS environment variable to show task progression.
+  * Add STARPU_SIMGRID environment variable guard against native builds.
 
 StarPU 1.3.7
 ====================================================================

+ 2 - 0
Makefile.am

@@ -53,9 +53,11 @@ if STARPU_BUILD_STARPURM
 SUBDIRS += starpurm
 endif
 
+if STARPU_USE_CPU
 if STARPU_BUILD_STARPUPY
 SUBDIRS += starpupy
 endif
+endif
 
 if STARPU_BUILD_SC_HYPERVISOR
 SUBDIRS += sc_hypervisor

+ 21 - 16
configure.ac

@@ -167,9 +167,8 @@ if test x$enable_simgrid = xyes ; then
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
 	fi
-	if test -n "$SIMGRID_LIBS" ; then
-		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
-	fi
+	SAVED_LIBS="${LIBS}"
+	LIBS="$SIMGRID_LIBS $LIBS"
 	AC_HAVE_LIBRARY([simgrid], [],
 		[
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
@@ -207,6 +206,7 @@ if test x$enable_simgrid = xyes ; then
 
 	# Oldies for compatibility with older simgrid
 	AC_CHECK_FUNCS([MSG_get_as_by_name MSG_zone_get_by_name MSG_environment_get_routing_root MSG_host_get_speed])
+	LIBS="${SAVED_LIBS}"
 
 	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
 	# We won't bind or detect anything
@@ -225,6 +225,7 @@ if test x$enable_simgrid = xyes ; then
 		SIMGRID_LIBS="$SIMGRID_LIBS -lstdc++"
 		LIBS="$LIBS -lstdc++"
 	fi
+	SIMGRID_LDFLAGS="$SIMGRID_LIBS -lsimgrid"
 
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	case \ $CXXFLAGS\  in
@@ -267,13 +268,13 @@ if test x$enable_simgrid = xyes ; then
 		AC_PATH_PROG([SIMGRID_MC], [simgrid-mc], [no], [$simgrid_dir/bin:$PATH])
 		LDFLAGS="$LDFLAGS -Wl,-znorelro -Wl,-znoseparate-code"
 		# libsimgrid needs to be linked from binaries themselves for MC to work
-		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lsimgrid"
+		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $SIMGRID_LDFLAGS"
 	fi
 fi
 AM_CONDITIONAL(STARPU_SIMGRID_MC, test x$enable_simgrid_mc = xyes)
 AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
 AC_SUBST(SIMGRID_CFLAGS)
-AC_SUBST(SIMGRID_LIBS)
+AC_SUBST(SIMGRID_LDFLAGS)
 AC_MSG_CHECKING(whether SimGrid is enabled)
 AC_MSG_RESULT($enable_simgrid)
 
@@ -2304,9 +2305,6 @@ if test x$maxnodes = x0 ; then
 	if test x$enable_simgrid = xyes ; then
 		# We need the room for the virtual CUDA/OpenCL devices
 		nodes=`expr 4 + $nmaxcudadev + $nmaxopencldev + $nmaxmicdev + 1 + $nmaxmpidev`
-		if test $nodes -gt 32 ; then
-			nodes=32
-		fi
 	else
 		# We have one memory node shared by all CPU workers, one node per GPU
 		# and per MIC device
@@ -2342,8 +2340,7 @@ if test x$maxnodes = x0 ; then
 	done
 fi
 if test $maxnodes -gt 32 ; then
-	# FIXME: at least use uint64 so we can have 64 memory nodes
-	AC_MSG_ERROR([selected number of nodes ($maxnodes) can not be greater than 32])
+	AC_MSG_WARN([Note: the wt_mask feature only supports 32 memory nodes])
 fi
 
 AC_MSG_CHECKING(maximum number of memory nodes)
@@ -3448,6 +3445,14 @@ then
 		AC_MSG_ERROR([python3 missing, cannot build StarPU python interface])
 	fi
 	AC_SUBST(PYTHON)
+	PYTHON_INCLUDE_DIRS="`$PYTHON -c "from sysconfig import get_paths as gp; print(gp()@<:@'include'@:>@)"`"
+	SAVED_CPPFLAGS="${CPPFLAGS}"
+	CPPFLAGS="$CPPFLAGS -I$PYTHON_INCLUDE_DIRS"
+	AC_CHECK_HEADERS([Python.h],[have_python_h=yes],[have_python_h=no])
+	if test "$have_python_h" = "no" ; then
+		AC_MSG_ERROR([Python.h missing, cannot build StarPU python interface (consider installing python-dev)])
+	fi
+	CPPFLAGS=${SAVED_CPPFLAGS}
 	AC_MSG_CHECKING(for python3 module joblib)
 	AC_PYTHON_MODULE(joblib,[joblib_avail=yes],[joblib_avail=no])
 	AC_MSG_RESULT($joblib_avail)
@@ -3565,7 +3570,7 @@ STARPU_H_CPPFLAGS="$HWLOC_CFLAGS $STARPU_CUDA_CPPFLAGS $STARPU_OPENCL_CPPFLAGS $
 AC_SUBST([STARPU_H_CPPFLAGS])
 
 # these are the flags needed for linking libstarpu (and thus also for static linking)
-LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LIBS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
+LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LDFLAGS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
 AC_SUBST([LIBSTARPU_LDFLAGS])
 
 # these are the flags needed for linking against libstarpu (because starpu.h makes its includer use pthread_*, simgrid, etc.)
@@ -3805,11 +3810,11 @@ AC_MSG_NOTICE([
 	       OpenMP runtime support enabled:                $enable_openmp
 	       Cluster support enabled:                       $enable_cluster
 	       SOCL enabled:                                  $build_socl
-               SOCL test suite:                               $run_socl_check
-               Scheduler Hypervisor:                          $build_sc_hypervisor
-               simgrid enabled:                               $enable_simgrid
-               ayudame enabled:                               $ayu_msg
-               HDF5 enabled:                                  $enable_hdf5
+	       SOCL test suite:                               $run_socl_check
+	       Scheduler Hypervisor:                          $build_sc_hypervisor
+	       simgrid enabled:                               $enable_simgrid
+	       ayudame enabled:                               $ayu_msg
+	       HDF5 enabled:                                  $enable_hdf5
 	       Native fortran support:                        $enable_build_fortran
 	       Native MPI fortran support:                    $use_mpi_fort
 	       Support for multiple linear regression models: $support_mlr

+ 18 - 3
contrib/ci.inria.fr/job-1-check.sh

@@ -37,7 +37,11 @@ basename=$(basename $tarball .tar.gz)
 export STARPU_HOME=$PWD/$basename/home
 mkdir -p $basename
 cd $basename
-env > $PWD/env
+(
+    echo "oldPWD=\${PWD}"
+    env|grep -v LS_COLORS | grep '^[A-Z]'|grep -v BASH_FUNC | grep '=' | sed 's/=/=\"/'| sed 's/$/\"/' | sed 's/^/export /'
+    echo "cd \$oldPWD"
+) > ${PWD}/env
 
 test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
 tar xfz ../$tarball
@@ -63,7 +67,17 @@ fi
 
 export CC=gcc
 
-CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc"
+set +e
+mpiexec -oversubscribe pwd 2>/dev/null
+ret=$?
+set -e
+ARGS=""
+if test "$ret" = "0"
+then
+    ARGS="--with-mpiexec-args=-oversubscribe"
+fi
+
+CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc $ARGS"
 CONFIGURE_CHECK=""
 day=$(date +%u)
 if test $day -le 5
@@ -72,10 +86,11 @@ then
 #else
     # we do a normal check, a long check takes too long on VM nodes
 fi
-../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS
+../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS $STARPU_USER_CONFIGURE_OPTIONS
 
 export STARPU_TIMEOUT_ENV=1800
 export MPIEXEC_TIMEOUT=1800
+
 make
 #make check
 (make -k check || true) 2>&1 | tee  ../check_$$

+ 22 - 0
contrib/gitlab/simgrid.sh

@@ -0,0 +1,22 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+STARPU_USER_CONFIGURE_OPTIONS="--enable-simgrid --disable-mpi --disable-mpi-check" ./contrib/ci.inria.fr/job-1-check.sh
+
+
+
+

+ 1 - 1
doc/doxygen/chapters/101_building.doxy

@@ -520,7 +520,7 @@ It can also be convenient to try simulated benchmarks, if you want to give a try
 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
 using the SimGrid version of StarPU: first install the SimGrid simulator from
 http://simgrid.gforge.inria.fr/ (we tested with SimGrid from 3.11 to 3.16, and
-3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with -Denable_msg=ON.
+3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with \c -Denable_msg=ON.
 Other versions may have compatibility issues, 3.17 notably does
 not build at all. MPI simulation does not work with version 3.22).
 Then configure StarPU with \ref enable-simgrid

+ 2 - 1
doc/doxygen/chapters/310_data_management.doxy

@@ -643,7 +643,8 @@ struct starpu_codelet accumulate_variable_cl =
         .cpu_funcs = { accumulate_variable_cpu },
         .cpu_funcs_name = { "accumulate_variable_cpu" },
         .cuda_funcs = { accumulate_variable_cuda },
-        .nbuffers = 1,
+        .nbuffers = 2,
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 }
 \endcode
 

文件差異過大導致無法顯示
+ 22 - 34
doc/doxygen/chapters/320_scheduling.doxy


+ 4 - 4
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -515,12 +515,12 @@ The <c>-f</c> option can also be used to display the performance in terms of GFl
 
 \verbatim
 $ tools/starpu_perfmodel_plot -f -e -s non_linear_memset_regression_based_energy
-$ gnuplot starpu_non_linear_memset_regression_based_energy.gp
-$ gv starpu_non_linear_memset_regression_based_energy.eps
+$ gnuplot starpu_gflops_non_linear_memset_regression_based_energy.gp
+$ gv starpu_gflops_non_linear_memset_regression_based_energy.eps
 \endverbatim
 
-\image html starpu_non_linear_memset_regression_based_energy_flops.png
-\image latex starpu_non_linear_memset_regression_based_energy_flops.eps "" width=\textwidth
+\image html starpu_gflops_non_linear_memset_regression_based_energy.png
+\image latex starpu_gflops_non_linear_memset_regression_based_energy.eps "" width=\textwidth
 
 We clearly see here that it is much more energy-efficient to stay in the L3 cache.
 

+ 31 - 0
doc/doxygen/chapters/410_mpi_support.doxy

@@ -744,6 +744,37 @@ starpu_mpi_data_set_rank(data, STARPU_MPI_PER_NODE);
 
 The data can then be used just like pernode above.
 
+\section MPIMpiRedux Inter-node reduction
+
+One might want to leverage a reduction pattern across several nodes.
+Using \c STARPU_REDUX, one can obtain reduction patterns across several nodes,
+however each core across the contributing nodes will spawn their own
+contribution to work with. In the case that these allocations or the
+required reductions are too expensive to execute for each contribution,
+the access mode \c STARPU_MPI_REDUX tells StarPU to spawn only one contribution
+per node executing tasks partaking in the reduction.
+
+Tasks producing a result in the inter-node reduction should be registered as
+accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode.
+
+\code{.c}
+static struct starpu_codelet contrib_cl =
+{
+	.cpu_funcs = {cpu_contrib}, /* cpu implementation(s) of the routine */
+	.nbuffers = 1, /* number of data handles referenced by this routine */
+	.modes = {STARPU_RW | STARPU_COMMUTE}, /* access modes for the contribution */
+	.name = "contribution"
+};
+\endcode
+
+When inserting these tasks, the access mode handed out to the StarPU-MPI layer
+should be \c STARPU_MPI_REDUX. Assuming \c data is owned by node 0 and we want node
+1 to compute the contribution, we could do the following.
+
+\code{.c}
+starpu_mpi_task_insert(MPI_COMM_WORLD, &contrib_cl, STARPU_MPI_REDUX, data, EXECUTE_ON_NODE, 1); /* Node 1 computes it */
+\endcode
+
 \section MPIPriorities Priorities
 
 All send functions have a <c>_prio</c> variant which takes an additional

+ 31 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -473,6 +473,16 @@ todo
 todo
 </dd>
 
+<dt>STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES</dt>
+<dd>
+\anchor STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
+\addindex __env__STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
+Specify if CUDA workers should do only fast allocations
+when running the datawizard progress of
+other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
+Default value is 0, allowing CUDA workers to do slow allocations.
+</dd>
+
 </dl>
 
 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
@@ -738,6 +748,27 @@ block when the memory allocation required for network reception overflows the
 available main memory (as typically set by \ref STARPU_LIMIT_CPU_MEM)
 </dd>
 
+<dt>STARPU_MPI_EARLYDATA_ALLOCATE</dt>
+<dd>
+\anchor STARPU_MPI_EARLYDATA_ALLOCATE
+\addindex __env__STARPU_MPI_EARLYDATA_ALLOCATE
+When set to 1, the MPI Driver will immediately allocate the data for early
+requests instead of issuing a data request and blocking. The default value is 0,
+issuing a data request. Because it is an early request and we do not know its
+real priority, the data request will assume \ref STARPU_DEFAULT_PRIO. In cases
+where there are many data requests with priorities greater than
+\ref STARPU_DEFAULT_PRIO, the MPI driver could be blocked for long periods.
+</dd>
+
+<dt>STARPU_SIMGRID</dt>
+<dd>
+\anchor STARPU_SIMGRID
+\addindex __env__STARPU_SIMGRID
+When set to 1 (the default is 0), this makes StarPU check that it was really
+built with simulation support. This is convenient in scripts to avoid using a
+native version, that would try to update performance models...
+</dd>
+
 <dt>STARPU_SIMGRID_TRANSFER_COST</dt>
 <dd>
 \anchor STARPU_SIMGRID_TRANSFER_COST

+ 1 - 1
doc/doxygen/chapters/code/disk_copy.c

@@ -33,7 +33,7 @@
 
 int main(int argc, char **argv)
 {
-	double * A,*B,*C,*D,*E,*F;
+	double *A, *F;
 
 	/* limit main ram to force to push in disk */
 	setenv("STARPU_LIMIT_CPU_MEM", "160", 1);

doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.eps → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.eps


doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.pdf → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.pdf


doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.png → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.png


+ 1 - 1
examples/Makefile.am

@@ -106,6 +106,7 @@ examplebin_PROGRAMS =
 noinst_HEADERS = 				\
 	axpy/axpy.h                             \
 	cg/cg.h					\
+	cg/cg_kernels.c				\
 	heat/lu_kernels_model.h			\
 	heat/dw_sparse_cg.h			\
 	heat/heat.h				\
@@ -869,7 +870,6 @@ if !STARPU_NO_BLAS_LIB
 
 cg_cg_SOURCES =					\
 	cg/cg.c					\
-	cg/cg_kernels.c				\
 	common/blas.c
 
 cg_cg_LDADD =					\

+ 6 - 2
examples/basic_examples/multiformat_conversion_codelets.c

@@ -41,6 +41,7 @@ struct starpu_codelet cpu_to_cuda_cl =
 	.cuda_funcs = {cpu_to_cuda_cuda_func},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.name = "codelet_cpu_to_cuda"
 };
 
@@ -48,6 +49,7 @@ struct starpu_codelet cuda_to_cpu_cl =
 {
 	.cpu_funcs = {cuda_to_cpu},
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.name = "codelet_cude_to_cpu"
 };
 #endif
@@ -73,12 +75,14 @@ struct starpu_codelet cpu_to_opencl_cl =
 {
 	.opencl_funcs = {cpu_to_opencl_opencl_func},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
 };
 
 struct starpu_codelet opencl_to_cpu_cl =
 {
 	.cpu_funcs = {opencl_to_cpu},
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
 };
 #endif

+ 54 - 158
examples/cg/cg.c

@@ -19,11 +19,6 @@
 #include <starpu.h>
 #include <common/blas.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cuda.h>
-#endif
-
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
 /*
  *	Conjugate Gradient
@@ -68,32 +63,34 @@
 
 #include "cg.h"
 
-static int long long n = 4096;
-static int nblocks = 8;
-static int use_reduction = 1;
+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks);
 
-static starpu_data_handle_t A_handle, b_handle, x_handle;
-static TYPE *A, *b, *x;
+#define HANDLE_TYPE_VECTOR starpu_data_handle_t
+#define HANDLE_TYPE_MATRIX starpu_data_handle_t
+#define TASK_INSERT(cl, ...) starpu_task_insert(cl, ##__VA_ARGS__)
+#define GET_VECTOR_BLOCK(v, i) starpu_data_get_sub_data(v, 1, i)
+#define GET_MATRIX_BLOCK(m, i, j) starpu_data_get_sub_data(m, 2, i, j)
+#define BARRIER()
+#define GET_DATA_HANDLE(handle)
+#define FPRINTF_SERVER FPRINTF
+
+#include "cg_kernels.c"
 
-#ifdef STARPU_QUICK_CHECK
-static int i_max = 5;
-#elif !defined(STARPU_LONG_CHECK)
-static int i_max = 100;
-#else
-static int i_max = 1000;
-#endif
-static double eps = (10e-14);
 
-static starpu_data_handle_t r_handle, d_handle, q_handle;
+
+static TYPE *A, *b, *x;
 static TYPE *r, *d, *q;
 
-static starpu_data_handle_t dtq_handle, rtr_handle;
-static TYPE dtq, rtr;
 
-extern struct starpu_codelet accumulate_variable_cl;
-extern struct starpu_codelet accumulate_vector_cl;
-extern struct starpu_codelet bzero_variable_cl;
-extern struct starpu_codelet bzero_vector_cl;
+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
+{
+	unsigned b;
+
+	for (b = 0; b < nblocks; b++)
+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	return 0;
+}
+
 
 /*
  *	Generate Input data
@@ -264,162 +261,48 @@ static void display_matrix(void)
 }
 #endif
 
-/*
- *	Main loop
- */
-
-static int cg(void)
+static void display_x_result(void)
 {
-	double delta_new, delta_0;
-
-	int i = 0;
-	int ret;
+	int j, i;
+	starpu_data_handle_t sub;
 
-	/* r <- b */
-	ret = copy_handle(r_handle, b_handle, nblocks);
-	if (ret == -ENODEV) return ret;
+	FPRINTF(stderr, "Computed X vector:\n");
 
-	/* r <- r - A x */
-	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
-	if (ret == -ENODEV) return ret;
+	int block_size = n / nblocks;
 
-	/* d <- r */
-	ret = copy_handle(d_handle, r_handle, nblocks);
-	if (ret == -ENODEV) return ret;
-
-	/* delta_new = dot(r,r) */
-	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
-	if (ret == -ENODEV) return ret;
-
-	starpu_data_acquire(rtr_handle, STARPU_R);
-	delta_new = rtr;
-	delta_0 = delta_new;
-	starpu_data_release(rtr_handle);
-
-	FPRINTF(stderr, "*************** INITIAL ************ \n");
-	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
-
-	double start;
-	double end;
-	start = starpu_timing_now();
-
-	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
+	for (j = 0; j < nblocks; j++)
 	{
-		double delta_old;
-		double alpha, beta;
-
-		starpu_iteration_push(i);
-
-		/* q <- A d */
-		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
-
-		/* dtq <- dot(d,q) */
-		dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
-
-		/* alpha = delta_new / dtq */
-		starpu_data_acquire(dtq_handle, STARPU_R);
-		alpha = delta_new/dtq;
-		starpu_data_release(dtq_handle);
-
-		/* x <- x + alpha d */
-		axpy_kernel(x_handle, d_handle, alpha, nblocks);
-
-		if ((i % 50) == 0)
-		{
-			/* r <- b */
-			copy_handle(r_handle, b_handle, nblocks);
-
-			/* r <- r - A x */
-			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
-		}
-		else
-		{
-			/* r <- r - alpha q */
-			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
-		}
-
-		/* delta_new = dot(r,r) */
-		dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
-
-		starpu_data_acquire(rtr_handle, STARPU_R);
-		delta_old = delta_new;
-		delta_new = rtr;
-		beta = delta_new / delta_old;
-		starpu_data_release(rtr_handle);
-
-		/* d <- beta d + r */
-		scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
-
-		if ((i % 10) == 0)
+		sub = starpu_data_get_sub_data(x_handle, 1, j);
+		starpu_data_acquire(sub, STARPU_R);
+		for (i = 0; i < block_size; i++)
 		{
-			/* We here take the error as ||r||_2 / (n||b||_2) */
-			double error = sqrt(delta_new/delta_0)/(1.0*n);
-			FPRINTF(stderr, "*****************************************\n");
-			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+			FPRINTF(stderr, "% 02.2e\n", x[j*block_size + i]);
 		}
-
-		starpu_iteration_pop();
-		i++;
+		starpu_data_release(sub);
 	}
-
-	end = starpu_timing_now();
-
-	double timing = end - start;
-	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
-	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
-	return 0;
 }
 
-static int check(void)
-{
-	return 0;
-}
 
 static void parse_args(int argc, char **argv)
 {
 	int i;
 	for (i = 1; i < argc; i++)
 	{
-	        if (strcmp(argv[i], "-n") == 0)
-		{
-			n = (int long long)atoi(argv[++i]);
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-maxiter") == 0)
-		{
-			i_max = atoi(argv[++i]);
-			if (i_max <= 0)
-			{
-				FPRINTF(stderr, "the number of iterations must be positive, not %d\n", i_max);
-				exit(EXIT_FAILURE);
-			}
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-nblocks") == 0)
-		{
-			nblocks = atoi(argv[++i]);
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-no-reduction") == 0)
-		{
-			use_reduction = 0;
-			continue;
-		}
-
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
 		{
-			FPRINTF(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
+			FPRINTF_SERVER(stderr, "usage: %s [-h] [-nblocks #blocks] [-display-result] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
 			exit(-1);
 		}
-        }
+	}
+
+	parse_common_args(argc, argv);
 }
 
+
 int main(int argc, char **argv)
 {
 	int ret;
+	double start, end;
 
 	/* Not supported yet */
 	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
@@ -434,9 +317,19 @@ int main(int argc, char **argv)
 
 	starpu_cublas_init();
 
+	FPRINTF(stderr, "************** PARAMETERS ***************\n");
+	FPRINTF(stderr, "Problem size (-n): %lld\n", n);
+	FPRINTF(stderr, "Maximum number of iterations (-maxiter): %d\n", i_max);
+	FPRINTF(stderr, "Number of blocks (-nblocks): %d\n", nblocks);
+	FPRINTF(stderr, "Reduction (-no-reduction): %s\n", use_reduction ? "enabled" : "disabled");
+
+	start = starpu_timing_now();
 	generate_random_problem();
 	register_data();
 	partition_data();
+	end = starpu_timing_now();
+
+	FPRINTF(stderr, "Problem intialization timing : %2.2f seconds\n", (end-start)/10e6);
 
 	ret = cg();
 	if (ret == -ENODEV)
@@ -445,10 +338,13 @@ int main(int argc, char **argv)
 		goto enodev;
 	}
 
-	ret = check();
-
 	starpu_task_wait_for_all();
 
+	if (display_result)
+	{
+		display_x_result();
+	}
+
 enodev:
 	unregister_data();
 	free_data();

+ 0 - 25
examples/cg/cg.h

@@ -54,29 +54,4 @@
 #define cublasscal	cublasSscal
 #endif
 
-int dot_kernel(starpu_data_handle_t v1,
-	       starpu_data_handle_t v2,
-	       starpu_data_handle_t s,
-	       unsigned nblocks,
-	       int use_reduction);
-
-int gemv_kernel(starpu_data_handle_t v1,
-                starpu_data_handle_t matrix, 
-                starpu_data_handle_t v2,
-                TYPE p1, TYPE p2,
-		unsigned nblocks,
-		int use_reduction);
-
-int axpy_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t v2, TYPE p1,
-		unsigned nblocks);
-
-int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
-		     starpu_data_handle_t v2, TYPE p2,
-		     unsigned nblocks);
-
-int copy_handle(starpu_data_handle_t dst,
-		starpu_data_handle_t src,
-		unsigned nblocks);
-
 #endif /* __STARPU_EXAMPLE_CG_H__ */

+ 216 - 37
examples/cg/cg_kernels.c

@@ -23,11 +23,43 @@
 #include <limits.h>
 
 #ifdef STARPU_USE_CUDA
+#include <cuda.h>
 #include <starpu_cublas_v2.h>
 static const TYPE gp1 = 1.0;
 static const TYPE gm1 = -1.0;
 #endif
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+static int nblocks = 8;
+
+#ifdef STARPU_QUICK_CHECK
+static int i_max = 5;
+static int long long n = 2048;
+#elif !defined(STARPU_LONG_CHECK)
+static int long long n = 4096;
+static int i_max = 100;
+#else
+static int long long n = 4096;
+static int i_max = 1000;
+#endif
+static double eps = (10e-14);
+
+int use_reduction = 1;
+int display_result = 0;
+
+HANDLE_TYPE_MATRIX A_handle;
+HANDLE_TYPE_VECTOR b_handle;
+HANDLE_TYPE_VECTOR x_handle;
+
+HANDLE_TYPE_VECTOR r_handle;
+HANDLE_TYPE_VECTOR d_handle;
+HANDLE_TYPE_VECTOR q_handle;
+
+starpu_data_handle_t dtq_handle;
+starpu_data_handle_t rtr_handle;
+TYPE dtq, rtr;
+
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 {
@@ -120,7 +152,7 @@ struct starpu_codelet accumulate_variable_cl =
 	.cuda_funcs = {accumulate_variable_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.model = &accumulate_variable_model
 };
@@ -164,7 +196,7 @@ struct starpu_codelet accumulate_vector_cl =
 	.cuda_funcs = {accumulate_vector_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.model = &accumulate_vector_model
 };
@@ -314,8 +346,8 @@ static struct starpu_codelet dot_kernel_cl =
 	.model = &dot_kernel_model
 };
 
-int dot_kernel(starpu_data_handle_t v1,
-	       starpu_data_handle_t v2,
+int dot_kernel(HANDLE_TYPE_VECTOR v1,
+	       HANDLE_TYPE_VECTOR v2,
 	       starpu_data_handle_t s,
 	       unsigned nblocks,
 	       int use_reduction)
@@ -327,21 +359,21 @@ int dot_kernel(starpu_data_handle_t v1,
 		starpu_data_invalidate_submit(s);
 	else
 	{
-		ret = starpu_task_insert(&bzero_variable_cl, STARPU_W, s, 0);
+		ret = TASK_INSERT(&bzero_variable_cl, STARPU_W, s, 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		ret = starpu_task_insert(&dot_kernel_cl,
+		ret = TASK_INSERT(&dot_kernel_cl,
 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
-					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_R, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R, GET_VECTOR_BLOCK(v2, b),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	return 0;
 }
@@ -477,9 +509,9 @@ static struct starpu_codelet gemv_kernel_cl =
 	.model = &gemv_kernel_model
 };
 
-int gemv_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t matrix,
-		starpu_data_handle_t v2,
+int gemv_kernel(HANDLE_TYPE_VECTOR v1,
+		HANDLE_TYPE_MATRIX matrix,
+		HANDLE_TYPE_VECTOR v2,
 		TYPE p1, TYPE p2,
 		unsigned nblocks,
 		int use_reduction)
@@ -489,13 +521,13 @@ int gemv_kernel(starpu_data_handle_t v1,
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	{
-		ret = starpu_task_insert(&scal_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
+		ret = TASK_INSERT(&scal_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b2),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b2,
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 
 	for (b2 = 0; b2 < nblocks; b2++)
@@ -503,15 +535,15 @@ int gemv_kernel(starpu_data_handle_t v1,
 		for (b1 = 0; b1 < nblocks; b1++)
 		{
 			TYPE one = 1.0;
-			ret = starpu_task_insert(&gemv_kernel_cl,
-						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
-						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
-						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
+			ret = TASK_INSERT(&gemv_kernel_cl,
+						 use_reduction?STARPU_REDUX:STARPU_RW,	GET_VECTOR_BLOCK(v1, b2),
+						 STARPU_R,	GET_MATRIX_BLOCK(matrix, b2, b1),
+						 STARPU_R,	GET_VECTOR_BLOCK(v2, b1),
 						 STARPU_VALUE,	&one,	sizeof(one),
 						 STARPU_VALUE,	&p2,	sizeof(p2),
 						 STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nblocks + b1,
 						 0);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+			STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 		}
 	}
 	return 0;
@@ -582,23 +614,23 @@ static struct starpu_codelet scal_axpy_kernel_cl =
 	.model = &scal_axpy_kernel_model
 };
 
-int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
-		     starpu_data_handle_t v2, TYPE p2,
+int scal_axpy_kernel(HANDLE_TYPE_VECTOR v1, TYPE p1,
+		     HANDLE_TYPE_VECTOR v2, TYPE p2,
 		     unsigned nblocks)
 {
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
 		int ret;
-		ret = starpu_task_insert(&scal_axpy_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+		ret = TASK_INSERT(&scal_axpy_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p2, sizeof(p2),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	return 0;
 }
@@ -661,30 +693,177 @@ static struct starpu_codelet axpy_kernel_cl =
 	.model = &axpy_kernel_model
 };
 
-int axpy_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t v2, TYPE p1,
+int axpy_kernel(HANDLE_TYPE_VECTOR v1,
+		HANDLE_TYPE_VECTOR v2, TYPE p1,
 		unsigned nblocks)
 {
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
 		int ret;
-		ret = starpu_task_insert(&axpy_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+		ret = TASK_INSERT(&axpy_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	return 0;
 }
 
-int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
+
+/*
+ *	Main loop
+ */
+int cg(void)
 {
-	unsigned b;
-	for (b = 0; b < nblocks; b++)
-		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	TYPE delta_new, delta_0, error, delta_old, alpha, beta;
+	double start, end, timing;
+	int i = 0, ret;
+
+	/* r <- b */
+	ret = copy_handle(r_handle, b_handle, nblocks);
+	if (ret == -ENODEV) return ret;
+
+	/* r <- r - A x */
+	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
+	if (ret == -ENODEV) return ret;
+
+	/* d <- r */
+	ret = copy_handle(d_handle, r_handle, nblocks);
+	if (ret == -ENODEV) return ret;
+
+	/* delta_new = dot(r,r) */
+	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
+	if (ret == -ENODEV) return ret;
+
+	GET_DATA_HANDLE(rtr_handle);
+	starpu_data_acquire(rtr_handle, STARPU_R);
+	delta_new = rtr;
+	delta_0 = delta_new;
+	starpu_data_release(rtr_handle);
+
+	FPRINTF_SERVER(stderr, "Delta limit: %e\n", (double) (eps*eps*delta_0));
+
+	FPRINTF_SERVER(stderr, "**************** INITIAL ****************\n");
+	FPRINTF_SERVER(stderr, "Delta 0: %e\n", delta_new);
+
+	BARRIER();
+	start = starpu_timing_now();
+
+	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
+	{
+		starpu_iteration_push(i);
+
+		/* q <- A d */
+		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
+
+		/* dtq <- dot(d,q) */
+		dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
+
+		/* alpha = delta_new / dtq */
+		GET_DATA_HANDLE(dtq_handle);
+		starpu_data_acquire(dtq_handle, STARPU_R);
+		alpha = delta_new / dtq;
+		starpu_data_release(dtq_handle);
+
+		/* x <- x + alpha d */
+		axpy_kernel(x_handle, d_handle, alpha, nblocks);
+
+		if ((i % 50) == 0)
+		{
+			/* r <- b */
+			copy_handle(r_handle, b_handle, nblocks);
+
+			/* r <- r - A x */
+			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
+		}
+		else
+		{
+			/* r <- r - alpha q */
+			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
+		}
+
+		/* delta_new = dot(r,r) */
+		dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
+
+		GET_DATA_HANDLE(rtr_handle);
+		starpu_data_acquire(rtr_handle, STARPU_R);
+		delta_old = delta_new;
+		delta_new = rtr;
+		beta = delta_new / delta_old;
+		starpu_data_release(rtr_handle);
+
+		/* d <- beta d + r */
+		scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
+
+		if ((i % 10) == 0)
+		{
+			/* We here take the error as ||r||_2 / (n||b||_2) */
+			error = sqrt(delta_new/delta_0)/(1.0*n);
+			FPRINTF_SERVER(stderr, "*****************************************\n");
+			FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+		}
+
+		starpu_iteration_pop();
+		i++;
+	}
+
+	BARRIER();
+	end = starpu_timing_now();
+	timing = end - start;
+
+	error = sqrt(delta_new/delta_0)/(1.0*n);
+	FPRINTF_SERVER(stderr, "*****************************************\n");
+	FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+	FPRINTF_SERVER(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
+	FPRINTF_SERVER(stderr, "Seconds per iteration : %2.2e seconds\n", timing/10e6/i);
+	FPRINTF_SERVER(stderr, "Number of iterations per second : %2.2e it/s\n", i/(timing/10e6));
+
 	return 0;
 }
+
+
+void parse_common_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-n") == 0)
+		{
+			n = (int long long)atoi(argv[++i]);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-display-result") == 0)
+		{
+			display_result = 1;
+			continue;
+		}
+
+		if (strcmp(argv[i], "-maxiter") == 0)
+		{
+			i_max = atoi(argv[++i]);
+			if (i_max <= 0)
+			{
+				FPRINTF_SERVER(stderr, "the number of iterations must be positive, not %d\n", i_max);
+				exit(EXIT_FAILURE);
+			}
+			continue;
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			nblocks = atoi(argv[++i]);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-no-reduction") == 0)
+		{
+			use_reduction = 0;
+			continue;
+		}
+	}
+}

+ 1 - 1
examples/pi/pi_redux.c

@@ -322,7 +322,7 @@ static struct starpu_codelet redux_codelet =
 	.cuda_funcs = {redux_cuda_func},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2
 };
 

+ 1 - 1
examples/reductions/dot_product.c

@@ -211,7 +211,7 @@ static struct starpu_codelet redux_codelet =
 	.opencl_funcs = {redux_opencl_func},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.name = "redux"
 };

+ 1 - 1
examples/reductions/minmax_reduction.c

@@ -95,7 +95,7 @@ static struct starpu_codelet minmax_redux_codelet =
 {
 	.cpu_funcs = {minmax_redux_cpu_func},
 	.cpu_funcs_name = {"minmax_redux_cpu_func"},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.name = "redux"
 };

+ 13 - 0
include/fstarpu_mod.f90

@@ -25,6 +25,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_RW
         type(c_ptr), bind(C) :: FSTARPU_SCRATCH
         type(c_ptr), bind(C) :: FSTARPU_REDUX
+        type(c_ptr), bind(C) :: FSTARPU_MPI_REDUX
         type(c_ptr), bind(C) :: FSTARPU_COMMUTE
         type(c_ptr), bind(C) :: FSTARPU_SSEND
         type(c_ptr), bind(C) :: FSTARPU_LOCALITY
@@ -36,11 +37,15 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_TASK_DEPS_ARRAY
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG
+        type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG
+        type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG
+        type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG
+        type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PRIORITY
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_NODE
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_DATA
@@ -2395,6 +2400,7 @@ module fstarpu_mod
                         FSTARPU_RW      = fstarpu_get_constant(C_CHAR_"FSTARPU_RW"//C_NULL_CHAR)
                         FSTARPU_SCRATCH = fstarpu_get_constant(C_CHAR_"FSTARPU_SCRATCH"//C_NULL_CHAR)
                         FSTARPU_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_REDUX"//C_NULL_CHAR)
+                        FSTARPU_MPI_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_MPI_REDUX"//C_NULL_CHAR)
                         FSTARPU_COMMUTE   = fstarpu_get_constant(C_CHAR_"FSTARPU_COMMUTE"//C_NULL_CHAR)
                         FSTARPU_SSEND   = fstarpu_get_constant(C_CHAR_"FSTARPU_SSEND"//C_NULL_CHAR)
                         FSTARPU_LOCALITY   = fstarpu_get_constant(C_CHAR_"FSTARPU_LOCALITY"//C_NULL_CHAR)
@@ -2406,12 +2412,19 @@ module fstarpu_mod
                         FSTARPU_TASK_DEPS_ARRAY = fstarpu_get_constant(C_CHAR_"FSTARPU_TASK_DEPS_ARRAY"//C_NULL_CHAR)
                         FSTARPU_CALLBACK        = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK"//C_NULL_CHAR)
                         FSTARPU_CALLBACK_WITH_ARG       = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG"//C_NULL_CHAR)
+                        FSTARPU_CALLBACK_WITH_ARG_NFREE       = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_CALLBACK_ARG    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG"//C_NULL_CHAR)
+                        FSTARPU_CALLBACK_ARG_NFREE    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK       = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_ARG   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG"//C_NULL_CHAR)
+                        FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE   = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_POP   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_POP_ARG       = &
                                 fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG"//C_NULL_CHAR)
+                        FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE       = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PRIORITY        = fstarpu_get_constant(C_CHAR_"FSTARPU_PRIORITY"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_NODE = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_NODE"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_DATA = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_DATA"//C_NULL_CHAR)

+ 8 - 0
include/starpu.h

@@ -471,6 +471,14 @@ struct starpu_conf
 	   Maximum spinning backoff of drivers. Default value: \c 32
 	 */
 	unsigned driver_spinning_backoff_max;
+
+	/**
+	   Specify if CUDA workers should do only fast allocations
+	   when running the datawizard progress of
+	   other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
+	   Default value is 0, allowing CUDA workers to do slow allocations.
+	 */
+	int cuda_only_fast_alloc_other_memnodes;
 };
 
 /**

+ 14 - 4
include/starpu_data.h

@@ -110,7 +110,15 @@ enum starpu_data_access_mode
 				   src/sched_policies/work_stealing_policy.c
 				   source code.
 				*/
-	STARPU_ACCESS_MODE_MAX=(1<<7) /**< todo */
+	STARPU_MPI_REDUX=(1<<7), /**< Inter-node reduction only. Codelets
+				    contributing to these reductions should
+				    be registered with STARPU_RW | STARPU_COMMUTE
+				    access modes.
+			            When inserting these tasks through the
+				    MPI layer however, the access mode needs
+				    to be STARPU_MPI_REDUX. */
+	STARPU_ACCESS_MODE_MAX=(1<<8) /**< The purpose of ACCESS_MODE_MAX is to
+					be the maximum of this enum. */
 };
 
 struct starpu_data_interface_ops;
@@ -305,7 +313,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_quick(starpu_data_hand
 
    This is a very internal interface, subject to changes, do not use this.
 */
-int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback_acquired)(void *arg, int *node, enum starpu_data_access_mode mode), void (*callback)(void *arg), void *arg, int sequential_consistency, int quick, long *pre_sync_jobid, long *post_sync_jobid);
+int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback_acquired)(void *arg, int *node, enum starpu_data_access_mode mode), void (*callback)(void *arg), void *arg, int sequential_consistency, int quick, long *pre_sync_jobid, long *post_sync_jobid, int prio);
 
 /**
    The application can call this function instead of starpu_data_acquire() so as to
@@ -560,8 +568,10 @@ struct starpu_codelet;
 /**
    Set the codelets to be used for \p handle when it is accessed in the
    mode ::STARPU_REDUX. Per-worker buffers will be initialized with
-   the codelet \p init_cl, and reduction between per-worker buffers will be
-   done with the codelet \p redux_cl.
+   the codelet \p init_cl (which has to take one handle with STARPU_W), and
+   reduction between per-worker buffers will be done with the codelet \p
+   redux_cl (which has to take a first accumulation handle with
+   STARPU_RW|STARPU_COMMUTE, and a second contribution handle with STARPU_R).
 */
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle, struct starpu_codelet *redux_cl, struct starpu_codelet *init_cl);
 

+ 8 - 0
include/starpu_hash.h

@@ -39,6 +39,14 @@ extern "C"
 uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc);
 
 /**
+   Compute the CRC of a pointer value seeded by the \p inputcrc
+   <em>current state</em>. The return value should be considered as the new
+   <em>current state</em> for future CRC computation. This is used for computing
+   data size footprint.
+*/
+uint32_t starpu_hash_crc32c_be_ptr(void *input, uint32_t inputcrc);
+
+/**
    Compute the CRC of a 32bit number seeded by the \p inputcrc
    <em>current state</em>. The return value should be considered as the new
    <em>current state</em> for future CRC computation. This is used for computing

+ 3 - 5
include/starpu_perfmodel.h

@@ -310,10 +310,10 @@ struct starpu_perfmodel
 void starpu_perfmodel_init(struct starpu_perfmodel *model);
 
 /**
-   Deinitialize the \p model performance model structure. You need to call this 
-   before deallocating the structure. You will probably want to call 
+   Deinitialize the \p model performance model structure. You need to call this
+   before deallocating the structure. You will probably want to call
    starpu_perfmodel_unload_model() before calling this function, to save the perfmodel.
-*/   
+*/
 int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
 
 /**
@@ -322,7 +322,6 @@ int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
    - \p workerid is the worker on which calibration is to be performed (in the case of GPUs, use -1 for CPUs)
    - \p archi is the type of architecture on which calibration will be run
 */
-
 int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
 
 /**
@@ -335,7 +334,6 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
    - \p workerid is the worker on which calibration was performed (in the case of GPUs, use -1 for CPUs)
    - \p archi is the type of architecture on which calibration was run
 */
-
 int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi);
 
 

+ 28 - 2
include/starpu_task.h

@@ -861,7 +861,28 @@ struct starpu_task
 	*/
 	void *prologue_callback_arg;
 
+	/** Optional field, the default value is <c>NULL</c>. This is a
+	   function pointer of prototype <c>void (*f)(void*)</c>
+	   which specifies a possible callback. If this pointer is
+	   non-<c>NULL</c>, the callback function is executed on the host
+	   when the task is pop-ed from the scheduler, just before getting
+	   executed. The callback is passed the value contained in the
+	   starpu_task::prologue_callback_pop_arg field.
+	   No callback is executed if the field is set to <c>NULL</c>.
+
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_PROLOGUE_CALLBACK_POP followed by the function pointer.
+	*/
 	void (*prologue_callback_pop_func)(void *);
+	/**
+	   Optional field, the default value is <c>NULL</c>. This is
+	   the pointer passed to the prologue_callback_pop function. This
+	   field is ignored if the field
+	   starpu_task::prologue_callback_pop_func is set to <c>NULL</c>.
+
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_PROLOGUE_CALLBACK_POP_ARG followed by the argument.
+	   */
 	void *prologue_callback_pop_arg;
 
 	/**
@@ -1424,8 +1445,13 @@ struct starpu_task
 	do {								\
 		if ((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS || (task)->cl->nbuffers > STARPU_NMAXBUFS) \
 			if ((task)->dyn_modes) (task)->dyn_modes[i] = mode; else (task)->modes[i] = mode; \
-		else							\
-			STARPU_CODELET_SET_MODE((task)->cl, mode, i);	\
+		else \
+		{							\
+			enum starpu_data_access_mode cl_mode = STARPU_CODELET_GET_MODE((task)->cl, i); \
+			STARPU_ASSERT_MSG(cl_mode == mode,	\
+				"Task <%s> can't set its  %d-th buffer mode to %d as the codelet it derives from uses %d", \
+				(task)->cl->name, i, mode, cl_mode);	\
+		} \
 	} while(0)
 
 /**

+ 4 - 0
include/starpu_util.h

@@ -257,6 +257,10 @@ extern "C"
 	_starpu_abort();				\
 } while(0)
 
+#if defined(_MSC_VER)
+  #undef STARPU_HAVE_STRERROR_R
+#endif
+
 #if defined(STARPU_HAVE_STRERROR_R)
 #if (! defined(__GLIBC__) || !__GLIBC__) || ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && (! defined(_GNU_SOURCE)))
 /* XSI-compliant version of strerror_r returns an int */

+ 2 - 2
julia/README

@@ -20,8 +20,8 @@ $ make
 Then, you need to add the lib/ directory to your library path and the julia/
 directory to your Julia load path:
 
-$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/lib
-$ export JULIA_LOAD_PATH=$JULIA_LOAD_PATH:$PWD
+$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/src/.lib
+$ export JULIA_LOAD_PATH=$PWD/src:$JULIA_LOAD_PATH
 
 This step can also be done by sourcing the setenv.sh script:
 

+ 2 - 2
julia/examples/execute.sh.in

@@ -1,7 +1,7 @@
 #!@REALBASH@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020-2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,7 +16,7 @@
 #
 
 set -x
-export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia:$JULIA_LOAD_PATH
+export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia/src:$JULIA_LOAD_PATH
 export STARPU_BUILD_DIR=@STARPU_BUILD_DIR@
 export STARPU_SRC_DIR=@STARPU_SRC_DIR@
 export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3

+ 2 - 2
julia/setenv.sh

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020-2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -13,7 +13,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
-export JULIA_LOAD_PATH=$JULIA_LOAD_PATH:$PWD
+export JULIA_LOAD_PATH=$PWD/src:$JULIA_LOAD_PATH
 
 if [ `uname` == "Darwin" ]; then
     export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$PWD/lib/

+ 1 - 1
julia/src/StarPU.jl

@@ -65,7 +65,7 @@ export STARPU_HISTORY_BASED, STARPU_REGRESSION_BASED
 export STARPU_NL_REGRESSION_BASED, STARPU_MULTIPLE_REGRESSION_BASED
 export starpu_tag_t
 export STARPU_NONE,STARPU_R,STARPU_W,STARPU_RW, STARPU_SCRATCH
-export STARPU_REDUX,STARPU_COMMUTE, STARPU_SSEND, STARPU_LOCALITY
+export STARPU_MPI_REDUX, STARPU_REDUX,STARPU_COMMUTE, STARPU_SSEND, STARPU_LOCALITY
 export STARPU_ACCESS_MODE_MAX
 
 # BLAS

+ 89 - 2
mpi/examples/Makefile.am

@@ -272,9 +272,27 @@ starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky_distributed
 endif
 
-########################
+##############
+# CG example #
+##############
+
+if !STARPU_SIMGRID
+if !STARPU_NO_BLAS_LIB
+examplebin_PROGRAMS += cg/cg
+starpu_mpi_EXAMPLES += cg/cg
+
+cg_cg_SOURCES =					\
+	cg/cg.c						\
+	../../examples/common/blas.c
+
+cg_cg_LDADD =					\
+	$(STARPU_BLAS_LDFLAGS)
+endif
+endif
+
+###########################
 # MPI Matrix mult example #
-########################
+###########################
 
 examplebin_PROGRAMS +=		\
 	matrix_mult/mm
@@ -290,6 +308,24 @@ starpu_mpi_EXAMPLES +=				\
 	matrix_mult/mm
 endif
 
+########################
+# MPI STARPU_MPI_REDUX #
+########################
+
+examplebin_PROGRAMS +=		\
+	mpi_redux/mpi_redux
+
+mpi_redux_mpi_redux_SOURCES	=		\
+	mpi_redux/mpi_redux.c
+
+mpi_redux_mpi_redux_LDADD =			\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	mpi_redux/mpi_redux
+endif
+
 ##########################################
 # Native Fortran MPI Matrix mult example #
 ##########################################
@@ -336,6 +372,55 @@ endif
 endif
 endif
 
+############################################
+# Native Fortran MPI STARPU_MPI_REDUX test #
+############################################
+
+if STARPU_HAVE_MPIFORT
+if !STARPU_SANITIZE
+examplebin_PROGRAMS +=		\
+	native_fortran/nf_mpi_redux
+
+native_fortran_nf_mpi_redux_SOURCES	=			\
+	native_fortran/fstarpu_mpi_mod.f90	\
+	native_fortran/fstarpu_mod.f90		\
+	native_fortran/nf_mpi_redux.f90	
+
+native_fortran_nf_mpi_redux_LDADD =					\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	native_fortran/nf_mpi_redux
+endif
+endif
+endif
+
+########################################
+# Native Fortran MPI STARPU_REDUX test #
+########################################
+
+if STARPU_HAVE_MPIFORT
+if !STARPU_SANITIZE
+examplebin_PROGRAMS +=		\
+	native_fortran/nf_redux_test
+
+native_fortran_nf_redux_test_SOURCES	=			\
+	native_fortran/fstarpu_mpi_mod.f90	\
+	native_fortran/fstarpu_mod.f90		\
+	native_fortran/nf_redux_test.f90	
+
+native_fortran_nf_redux_test_LDADD =					\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	native_fortran/nf_redux_test
+endif
+endif
+endif
+
+
 ###################
 # complex example #
 ###################
@@ -427,6 +512,8 @@ native_fortran/nf_mm_cl.o: fstarpu_mod.mod
 native_fortran/nf_mm.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
+native_fortran/nf_redux_test.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
+native_fortran/nf_mpi_redux.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 endif
 endif
 

+ 422 - 0
mpi/examples/cg/cg.c

@@ -0,0 +1,422 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <assert.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <common/blas.h>
+
+/*
+ * Distributed version of Conjugate Gradient implemented in examples/cg/cg.c
+ *
+ * Use -display-result option and compare with the non-distributed version: the
+ * x vector should be the same.
+ */
+
+#include "../../../examples/cg/cg.h"
+
+static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, unsigned nblocks);
+
+#define HANDLE_TYPE_VECTOR starpu_data_handle_t*
+#define HANDLE_TYPE_MATRIX starpu_data_handle_t**
+#define TASK_INSERT(cl, ...) starpu_mpi_task_insert(MPI_COMM_WORLD, cl, ##__VA_ARGS__)
+#define GET_VECTOR_BLOCK(v, i) v[i]
+#define GET_MATRIX_BLOCK(m, i, j) m[i][j]
+#define BARRIER() starpu_mpi_barrier(MPI_COMM_WORLD);
+#define GET_DATA_HANDLE(handle) starpu_mpi_get_data_on_all_nodes_detached(MPI_COMM_WORLD, handle)
+
+static int block_size;
+
+static int rank;
+static int nodes_p = 2;
+static int nodes_q;
+
+static TYPE ***A;
+static TYPE **x;
+static TYPE **b;
+
+static TYPE **r;
+static TYPE **d;
+static TYPE **q;
+
+#define FPRINTF_SERVER(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT") && rank == 0) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+#include "../../../examples/cg/cg_kernels.c"
+
+static int my_distrib(const int y, const int x)
+{
+	return (y%nodes_q)*nodes_p + (x%nodes_p);
+}
+
+static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, unsigned nblocks)
+{
+	unsigned b;
+
+	for (b = 0; b < nblocks; b++)
+	{
+		if (rank == my_distrib(b, 0))
+		{
+			starpu_data_cpy(dst[b], src[b], /* asynchronous */ 1, /* without callback */ NULL, NULL);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ *	Generate Input data
+ */
+static void generate_random_problem(void)
+{
+	unsigned nn, mm, m, n, mpi_rank;
+
+	A = malloc(nblocks * sizeof(TYPE **));
+	x = malloc(nblocks * sizeof(TYPE *));
+	b = malloc(nblocks * sizeof(TYPE *));
+
+	r = malloc(nblocks * sizeof(TYPE *));
+	d = malloc(nblocks * sizeof(TYPE *));
+	q = malloc(nblocks * sizeof(TYPE *));
+
+	for (m = 0; m < nblocks; m++)
+	{
+		A[m] = malloc(nblocks * sizeof(TYPE*));
+
+		mpi_rank = my_distrib(m, 0);
+
+		if (mpi_rank == rank || display_result)
+		{
+			starpu_malloc((void**) &x[m], block_size*sizeof(TYPE));
+		}
+
+		if (mpi_rank == rank)
+		{
+			starpu_malloc((void**) &b[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &r[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &d[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &q[m], block_size*sizeof(TYPE));
+
+			for (mm = 0; mm < block_size; mm++)
+			{
+				x[m][mm] = (TYPE) 0.0;
+				b[m][mm] = (TYPE) 1.0;
+				r[m][mm] = (TYPE) 0.0;
+				d[m][mm] = (TYPE) 0.0;
+				q[m][mm] = (TYPE) 0.0;
+			}
+		}
+
+		for (n = 0; n < nblocks; n++)
+		{
+			mpi_rank = my_distrib(m, n);
+			if (mpi_rank == rank)
+			{
+				starpu_malloc((void**) &A[m][n], block_size*block_size*sizeof(TYPE));
+
+				for (nn = 0; nn < block_size; nn++)
+				{
+					for (mm = 0; mm < block_size; mm++)
+					{
+						/* We take the Hilbert matrix that is not well conditioned but positive definite: H(i,j) = 1/(1+i+j) */
+						A[m][n][mm + nn*block_size] = (TYPE) (1.0/(1.0+(nn+(m*block_size)+mm+(n*block_size))));
+					}
+				}
+			}
+		}
+	}
+}
+
+static void free_data(void)
+{
+	unsigned nn, mm, m, n, mpi_rank;
+
+	for (m = 0; m < nblocks; m++)
+	{
+		mpi_rank = my_distrib(m, 0);
+
+		if (mpi_rank == rank || display_result)
+		{
+			starpu_free((void*) x[m]);
+		}
+
+		if (mpi_rank == rank)
+		{
+			starpu_free((void*) b[m]);
+			starpu_free((void*) r[m]);
+			starpu_free((void*) d[m]);
+			starpu_free((void*) q[m]);
+		}
+
+		for (n = 0; n < nblocks; n++)
+		{
+			mpi_rank = my_distrib(m, n);
+			if (mpi_rank == rank)
+			{
+				starpu_free((void*) A[m][n]);
+			}
+		}
+
+		free(A[m]);
+	}
+
+	free(A);
+	free(x);
+	free(b);
+	free(r);
+	free(d);
+	free(q);
+}
+
+/* Register every vector/matrix block with StarPU and StarPU-MPI.
+ * Owner ranks register their local buffer in STARPU_MAIN_RAM; non-owner ranks
+ * register a placeholder handle (home node -1, NULL pointer) so StarPU-MPI can
+ * allocate a temporary buffer on demand when data is received.  Each handle is
+ * given a unique MPI tag (monotonically increasing mpi_tag) and coordinates
+ * for offline tracing tools.  When reductions are enabled, q/r and the two
+ * scalar accumulators get their redux/init codelets attached. */
+static void register_data(void)
+{
+	unsigned m, n;
+	int mpi_rank;
+	starpu_mpi_tag_t mpi_tag = 0;
+
+	A_handle = malloc(nblocks*sizeof(starpu_data_handle_t*));
+	x_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	b_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	r_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	d_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	q_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+	for (m = 0; m < nblocks; m++)
+	{
+		mpi_rank = my_distrib(m, 0);
+		A_handle[m] = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+		/* x needs a real buffer everywhere when the result will be printed */
+		if (mpi_rank == rank || display_result)
+		{
+			starpu_vector_data_register(&x_handle[m], STARPU_MAIN_RAM, (uintptr_t) x[m], block_size, sizeof(TYPE));
+		}
+		else if (!display_result)
+		{
+			assert(mpi_rank != rank);
+			starpu_vector_data_register(&x_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+		}
+
+		if (mpi_rank == rank)
+		{
+			starpu_vector_data_register(&b_handle[m], STARPU_MAIN_RAM, (uintptr_t) b[m], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&r_handle[m], STARPU_MAIN_RAM, (uintptr_t) r[m], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&d_handle[m], STARPU_MAIN_RAM, (uintptr_t) d[m], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&q_handle[m], STARPU_MAIN_RAM, (uintptr_t) q[m], block_size, sizeof(TYPE));
+		}
+		else
+		{
+			starpu_vector_data_register(&b_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&r_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&d_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&q_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+		}
+
+		/* 1D coordinates (block row index) for the trace tools */
+		starpu_data_set_coordinates(x_handle[m], 1, m);
+		starpu_mpi_data_register(x_handle[m], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(b_handle[m], 1, m);
+		starpu_mpi_data_register(b_handle[m], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(r_handle[m], 1, m);
+		starpu_mpi_data_register(r_handle[m], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(d_handle[m], 1, m);
+		starpu_mpi_data_register(d_handle[m], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(q_handle[m], 1, m);
+		starpu_mpi_data_register(q_handle[m], ++mpi_tag, mpi_rank);
+
+		if (use_reduction)
+		{
+			starpu_data_set_reduction_methods(q_handle[m], &accumulate_vector_cl, &bzero_vector_cl);
+			starpu_data_set_reduction_methods(r_handle[m], &accumulate_vector_cl, &bzero_vector_cl);
+		}
+
+		for (n = 0; n < nblocks; n++)
+		{
+			mpi_rank = my_distrib(m, n);
+
+			if (mpi_rank == rank)
+			{
+				starpu_matrix_data_register(&A_handle[m][n], STARPU_MAIN_RAM, (uintptr_t) A[m][n], block_size, block_size, block_size, sizeof(TYPE));
+			}
+			else
+			{
+				starpu_matrix_data_register(&A_handle[m][n], -1, (uintptr_t) NULL, block_size, block_size, block_size, sizeof(TYPE));
+			}
+
+			/* 2D coordinates (column, row) for the trace tools */
+			starpu_data_set_coordinates(A_handle[m][n], 2, n, m);
+			starpu_mpi_data_register(A_handle[m][n], ++mpi_tag, mpi_rank);
+		}
+	}
+
+	/* Scalar accumulators for the dot products, owned by rank 0 */
+	starpu_variable_data_register(&dtq_handle, STARPU_MAIN_RAM, (uintptr_t)&dtq, sizeof(TYPE));
+	starpu_variable_data_register(&rtr_handle, STARPU_MAIN_RAM, (uintptr_t)&rtr, sizeof(TYPE));
+	starpu_mpi_data_register(rtr_handle, ++mpi_tag, 0);
+	starpu_mpi_data_register(dtq_handle, ++mpi_tag, 0);
+
+	if (use_reduction)
+	{
+		starpu_data_set_reduction_methods(dtq_handle, &accumulate_variable_cl, &bzero_variable_cl);
+		starpu_data_set_reduction_methods(rtr_handle, &accumulate_variable_cl, &bzero_variable_cl);
+	}
+}
+
+/* Unregister every handle created by register_data() and free the handle
+ * arrays.  Unregistering also flushes any cached replicates back to the
+ * home node before the underlying buffers are freed by free_data(). */
+static void unregister_data(void)
+{
+	unsigned m, n;
+
+	for (m = 0; m < nblocks; m++)
+	{
+		starpu_data_unregister(x_handle[m]);
+		starpu_data_unregister(b_handle[m]);
+		starpu_data_unregister(r_handle[m]);
+		starpu_data_unregister(d_handle[m]);
+		starpu_data_unregister(q_handle[m]);
+
+		for (n = 0; n < nblocks; n++)
+		{
+			starpu_data_unregister(A_handle[m][n]);
+		}
+
+		free(A_handle[m]);
+	}
+
+	starpu_data_unregister(dtq_handle);
+	starpu_data_unregister(rtr_handle);
+
+	free(A_handle);
+	free(x_handle);
+	free(b_handle);
+	free(r_handle);
+	free(d_handle);
+	free(q_handle);
+}
+
+/* Gather all blocks of the solution vector x on rank 0 and print them.
+ * Requires that register_data() registered real x buffers on every rank
+ * (which is the case when display_result is set). */
+static void display_x_result(void)
+{
+	int j, i; /* NOTE(review): signed counters compared to unsigned nblocks/block_size */
+
+	for (j = 0; j < nblocks; j++)
+	{
+		starpu_mpi_get_data_on_node(MPI_COMM_WORLD, x_handle[j], 0);
+	}
+
+	if (rank == 0)
+	{
+		FPRINTF_SERVER(stderr, "Computed X vector:\n");
+		for (j = 0; j < nblocks; j++)
+		{
+			/* Acquire in read mode to get a coherent CPU view before printing */
+			starpu_data_acquire(x_handle[j], STARPU_R);
+			for (i = 0; i < block_size; i++)
+			{
+				FPRINTF(stderr, "% 02.2e\n", x[j][i]);
+			}
+			starpu_data_release(x_handle[j]);
+		}
+	}
+}
+
+/* Parse the MPI-specific command line options (-p: width of the node grid,
+ * -h/--help: usage message), then delegate the options shared with the
+ * non-MPI CG example to parse_common_args(). */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-p") == 0)
+		{
+			nodes_p = atoi(argv[++i]);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
+		{
+			FPRINTF_SERVER(stderr, "usage: %s [-h] [-nblocks #blocks] [-display-result] [-p node_grid_width] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
+			exit(-1);
+		}
+	}
+
+	parse_common_args(argc, argv);
+}
+
+/* Distributed CG driver: initialize StarPU-MPI, validate that the node grid
+ * and block count divide the problem evenly, build and register the data,
+ * run the CG iteration (cg()), optionally display the result, then tear
+ * everything down.  Returns 77 (test skipped) when no device is available. */
+int main(int argc, char **argv)
+{
+	int worldsize, ret;
+	double start, end;
+
+	/* Not supported yet */
+	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
+		return 77;
+
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	parse_args(argc, argv);
+
+	/* The ranks are laid out on a nodes_p x nodes_q grid */
+	if (worldsize % nodes_p != 0)
+	{
+		FPRINTF_SERVER(stderr, "Node grid (%d) width must divide the number of nodes (%d).\n", nodes_p, worldsize);
+		starpu_mpi_shutdown();
+		return 1;
+	}
+	nodes_q = worldsize / nodes_p;
+
+	if (n % nblocks != 0)
+	{
+		FPRINTF_SERVER(stderr, "The number of blocks (%d) must divide the matrix size (%lld).\n", nblocks, n);
+		starpu_mpi_shutdown();
+		return 1;
+	}
+	block_size = n / nblocks;
+
+	starpu_cublas_init();
+
+	FPRINTF_SERVER(stderr, "************** PARAMETERS ***************\n");
+	FPRINTF_SERVER(stderr, "%d nodes (%dx%d)\n", worldsize, nodes_p, nodes_q);
+	FPRINTF_SERVER(stderr, "Problem size (-n): %lld\n", n);
+	FPRINTF_SERVER(stderr, "Maximum number of iterations (-maxiter): %d\n", i_max);
+	FPRINTF_SERVER(stderr, "Number of blocks (-nblocks): %d\n", nblocks);
+	FPRINTF_SERVER(stderr, "Reduction (-no-reduction): %s\n", use_reduction ? "enabled" : "disabled");
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	start = starpu_timing_now();
+	generate_random_problem();
+	register_data();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	end = starpu_timing_now();
+
+	/* NOTE(review): starpu_timing_now() is in microseconds; dividing by 10e6
+	 * (i.e. 1e7) to print "seconds" looks off by a factor of 10 -- TODO confirm */
+	FPRINTF_SERVER(stderr, "Problem initialization timing : %2.2f seconds\n", (end-start)/10e6);
+
+	ret = cg();
+	if (ret == -ENODEV)
+	{
+		ret = 77;
+		goto enodev;
+	}
+
+	starpu_task_wait_for_all();
+
+	if (display_result)
+	{
+		display_x_result();
+	}
+
+enodev:
+	unregister_data();
+	free_data();
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	return ret;
+}

+ 201 - 0
mpi/examples/mpi_redux/mpi_redux.c

@@ -0,0 +1,201 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example illustrates how to use the STARPU_MPI_REDUX mode
+ * and compare it with the standard STARPU_REDUX.
+ *
+ * In order to make this comparison salient, the init codelet is not
+ * a task that sets the handle to a neutral element but rather to a value
+ * that depends on the working node.
+ * This is not a proper way to use a reduction pattern; however, it
+ * can be seen as modelling the cost/weight of each contribution.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+#include <unistd.h>
+
+/* CPU kernel of the work tasks: a += 3.0 + b.
+ * The sleep(2) simulates a non-trivial task duration so the difference
+ * between the two reduction modes is observable in the trace. */
+static void cl_cpu_work(void *handles[], void*arg)
+{
+	(void)arg;
+	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
+	sleep(2);
+	printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a);
+	*a = 3.0 + *a + *b;
+	printf("%f\n",*a);
+}
+
+/* Work codelet for the STARPU_REDUX variant: the accumulator is accessed
+ * in STARPU_REDUX mode (per-worker copies, reduced afterwards).
+ * NOTE(review): .name = "task_init" looks like a copy-paste; "task_work"
+ * would be clearer in traces -- TODO confirm intent. */
+static struct starpu_codelet work_cl =
+{
+	.cpu_funcs = { cl_cpu_work },
+	.nbuffers = 2,
+	.modes = { STARPU_REDUX, STARPU_R },
+	.name = "task_init"
+};
+
+/* Work codelet for the STARPU_MPI_REDUX variant: within a node the
+ * accumulator is accessed RW|COMMUTE (tasks may run in any order on the
+ * single local copy); the inter-node reduction is driven by the
+ * STARPU_MPI_REDUX access mode given at task-insertion time. */
+static struct starpu_codelet mpi_work_cl =
+{
+	.cpu_funcs = { cl_cpu_work },
+	.nbuffers = 2,
+	.modes = { STARPU_RW | STARPU_COMMUTE, STARPU_R },
+	.name = "task_init-mpi"
+};
+
+/* Init kernel for the reduction: deliberately NOT a neutral element --
+ * it seeds the copy with the MPI rank (see the file-header comment),
+ * which makes the number of spawned copies visible in the final result. */
+static void cl_cpu_task_init(void *handles[], void*arg)
+{
+	(void) arg;
+	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	sleep(1);
+	printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(), *a);
+	*a = starpu_mpi_world_rank();
+}
+
+/* Codelet wrapping cl_cpu_task_init; registered as the per-copy
+ * initializer via starpu_data_set_reduction_methods(). */
+static struct starpu_codelet task_init_cl =
+{
+	.cpu_funcs = { cl_cpu_task_init },
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+	.name = "task_init"
+};
+
+/* Reduction kernel: fold the source copy (handles[1]) into the
+ * destination copy (handles[0]) by addition. */
+static void cl_cpu_task_red(void *handles[], void*arg)
+{
+	(void) arg;
+	double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
+	sleep(2);
+	printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad);
+	*ad = *ad + *as;
+}
+
+/* Codelet wrapping cl_cpu_task_red; registered as the reduction
+ * operator via starpu_data_set_reduction_methods(). */
+static struct starpu_codelet task_red_cl =
+{
+	.cpu_funcs = { cl_cpu_task_red },
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+	.name = "task_red"
+};
+
+/* Run the same accumulation workload twice -- first with STARPU_MPI_REDUX
+ * (one local copy per node, RW|COMMUTE codelet), then with STARPU_REDUX
+ * (one copy per worker) -- and print the reduced result on rank 0 so the
+ * two behaviours can be compared.  Needs at least 2 ranks and 2 CPU
+ * workers per rank; returns STARPU_TEST_SKIPPED otherwise. */
+int main(int argc, char *argv[])
+{
+	int comm_rank, comm_size;
+	/* Initializes StarPU and the StarPU-MPI layer */
+	starpu_fxt_autostart_profiling(0);
+	int ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	/* NOTE(review): message typo -- should read "starpu_mpi_init_conf" */
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_ini_conft");
+
+	int nworkers = starpu_cpu_worker_get_count();
+	if (nworkers < 2)
+	{
+        	FPRINTF(stderr, "We need at least 2 CPU worker per node.\n");
+        	starpu_mpi_shutdown();
+       		return STARPU_TEST_SKIPPED;
+	}
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size);
+	if (comm_size < 2)
+	{
+        	FPRINTF(stderr, "We need at least 2 nodes.\n");
+        	starpu_mpi_shutdown();
+       		return STARPU_TEST_SKIPPED;
+	}
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
+
+	/* a: accumulator owned by rank 0; b[j]: per-rank contribution */
+	double a, b[comm_size];
+	starpu_data_handle_t a_h, b_h[comm_size];
+	double work_coef = 2;
+	enum starpu_data_access_mode codelet_mode; /* NOTE(review): unused variable */
+	enum starpu_data_access_mode task_mode;
+	int i,j,work_node;
+    	starpu_mpi_tag_t tag = 0;
+	/* Iteration 0: STARPU_MPI_REDUX; iteration 1: STARPU_REDUX */
+	for (i = 0 ; i < 2 ; i++)
+	{
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		if (i==0)
+			task_mode = STARPU_MPI_REDUX;
+		else
+			task_mode = STARPU_REDUX;
+		if (comm_rank == 0)
+		{
+			a = 1.0;
+			printf("init a = %f\n", a);
+			starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double));
+			for (j=0;j<comm_size;j++)
+				starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
+		}
+		else
+		{
+			b[comm_rank] = 1.0 / (comm_rank + 1.0);
+			printf("init b_%d = %f\n", comm_rank, b[comm_rank]);
+			starpu_variable_data_register(&a_h, -1, 0, sizeof(double));
+			for (j=0;j<comm_size;j++)
+			{
+				if (j == comm_rank)
+					starpu_variable_data_register(&b_h[j], STARPU_MAIN_RAM, (uintptr_t)&b[j], sizeof(double));
+				else
+					starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
+			}
+		}
+		starpu_mpi_data_register(a_h, tag++, 0);
+		for (j=0;j<comm_size;j++)
+			starpu_mpi_data_register(b_h[j], tag++, j);
+
+		starpu_data_set_reduction_methods(a_h, &task_red_cl, &task_init_cl);
+		starpu_fxt_start_profiling();
+		/* Each non-zero rank contributes work_coef*nworkers tasks */
+		for (work_node=1; work_node < comm_size;work_node++)
+		{
+			for (j=1;j<=work_coef*nworkers;j++)
+			{
+				if (i == 0)
+				    starpu_mpi_task_insert(MPI_COMM_WORLD,
+					&mpi_work_cl,
+					task_mode, a_h,
+					STARPU_R, b_h[work_node],
+					STARPU_EXECUTE_ON_NODE, work_node,
+					0);
+				else
+				    starpu_mpi_task_insert(MPI_COMM_WORLD,
+					&work_cl,
+					task_mode, a_h,
+					STARPU_R, b_h[work_node],
+					STARPU_EXECUTE_ON_NODE, work_node,
+					0);
+			}
+		}
+		/* Collapse all remote/worker copies of a_h back to its owner (rank 0) */
+		starpu_mpi_redux_data(MPI_COMM_WORLD, a_h);
+		starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		if (comm_rank == 0)
+		{
+			double tmp = 0.0;
+			for (work_node = 1; work_node < comm_size ; work_node++)
+				tmp += 1.0 / (work_node + 1.0);
+			printf("computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp));
+		}
+		starpu_data_unregister(a_h);
+		for (work_node=0; work_node < comm_size;work_node++)
+			starpu_data_unregister(b_h[work_node]);
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+	}
+	starpu_mpi_shutdown();
+	return 0;
+}

+ 253 - 0
mpi/examples/native_fortran/nf_mpi_redux.f90

@@ -0,0 +1,253 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+!
+! Native-Fortran counterpart of mpi_redux.c: run the same workload with
+! FSTARPU_MPI_REDUX (trial 1) and FSTARPU_REDUX (trial 2) and print the
+! reduced accumulator on rank 0.  Needs >= 2 ranks and >= 1 worker each.
+program nf_mpi_redux
+  use iso_c_binding
+  use fstarpu_mod
+  use fstarpu_mpi_mod
+
+  implicit none
+
+  integer, target                         :: ret, np, i, j, trial
+  type(c_ptr)                             :: work_cl, task_rw_cl,task_red_cl, task_ini_cl
+  character(kind=c_char,len=*), parameter :: name=C_CHAR_"task"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: namered=C_CHAR_"task_red"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: nameini=C_CHAR_"task_ini"//C_NULL_CHAR
+  real(kind(1.d0)), target                :: a,tmp
+  real(kind(1.d0)), target, allocatable   :: b(:)
+  integer(kind=8)                         :: tag, err
+  type(c_ptr)                             :: ahdl
+  type(c_ptr), target, allocatable        :: bhdl(:)
+  type(c_ptr)                             :: task_mode, codelet_mode
+  integer, target                         :: comm_world,comm_w_rank, comm_size
+  integer(c_int), target                  :: w_node, nworkers, work_coef
+
+  call fstarpu_fxt_autostart_profiling(0)
+  ret = fstarpu_init(c_null_ptr)
+  ret = fstarpu_mpi_init(1)
+
+  comm_world = fstarpu_mpi_world_comm()
+  comm_w_rank  = fstarpu_mpi_world_rank()
+  comm_size  = fstarpu_mpi_world_size()
+  if (comm_size.lt.2) then
+    write(*,'(" ")')
+    write(*,'("This application is meant to run with at least two nodes.")')
+    stop 2
+  end if
+  allocate(b(comm_size-1), bhdl(comm_size-1))
+  nworkers = fstarpu_worker_get_count()
+  if (nworkers.lt.1) then
+    write(*,'(" ")')
+    write(*,'("This application is meant to run with at least one worker per node.")')
+    stop 2
+  end if
+
+  ! allocate and reduction codelets
+  task_red_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_red_cl, namered)
+  call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW)
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
+
+  task_ini_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_ini_cl, nameini)
+  call fstarpu_codelet_add_cpu_func(task_ini_cl,C_FUNLOC(cl_cpu_task_ini))
+  call fstarpu_codelet_add_buffer(task_ini_cl, FSTARPU_W)
+
+  work_coef=2
+
+  ! trial 1 uses STARPU_MPI_REDUX (RW|COMMUTE codelet), trial 2 STARPU_REDUX
+  do trial=1,2
+
+  if (trial.eq.1) then
+        write(*,*) "Using STARPU_MPI_REDUX"
+        codelet_mode = FSTARPU_RW.ior.FSTARPU_COMMUTE
+        task_mode = FSTARPU_MPI_REDUX
+  else if (trial.eq.2) then
+        write(*,*) "Using STARPU_REDUX"
+        codelet_mode = FSTARPU_REDUX
+        task_mode = FSTARPU_REDUX
+  end if
+  ! allocate and fill codelet structs
+  work_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(work_cl, name)
+  call fstarpu_codelet_add_cpu_func(work_cl, C_FUNLOC(cl_cpu_task))
+  call fstarpu_codelet_add_buffer(work_cl, codelet_mode)
+  call fstarpu_codelet_add_buffer(work_cl, FSTARPU_R)
+  err = fstarpu_mpi_barrier(comm_world)
+
+  if(comm_w_rank.eq.0) then
+    write(*,'(" ")')
+    a = 1.0
+    write(*,*) "init a = ", a
+  else
+    b(comm_w_rank) = 1.0 / (comm_w_rank + 1.0)
+    write(*,*) "init b_",comm_w_rank,"=", b(comm_w_rank), " AT ", &
+c_loc(bhdl(comm_w_rank)) ! This is not really meaningful
+  end if
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+  ! rank 0 owns the accumulator a, rank i owns contribution b(i)
+  tag = 0
+  if(comm_w_rank.eq.0) then
+    call fstarpu_variable_data_register(ahdl, 0, c_loc(a),c_sizeof(a))
+    do i=1,comm_size-1
+        call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
+    end do
+  else
+    call fstarpu_variable_data_register(ahdl, -1, c_null_ptr,c_sizeof(a))
+    do i=1,comm_size-1
+      if (i.eq.comm_w_rank) then
+        call fstarpu_variable_data_register(bhdl(i), 0, c_loc(b(i)),c_sizeof(b(i)))
+      else
+        call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
+      end if
+    end do
+  end if
+  call fstarpu_mpi_data_register(ahdl,  tag,  0)
+  do i=1,comm_size-1
+     call fstarpu_mpi_data_register(bhdl(i), tag+i,i)
+  end do
+
+  tag = tag + comm_size
+
+  call fstarpu_data_set_reduction_methods(ahdl,task_red_cl,task_ini_cl)
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+
+  call fstarpu_fxt_start_profiling()
+  do w_node=1,comm_size-1
+    do i=1,work_coef*nworkers
+      call fstarpu_mpi_task_insert( (/ c_loc(comm_world),   &
+             work_cl,                                         &
+             task_mode, ahdl,                            &
+             FSTARPU_R, bhdl(w_node),                      &
+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+             C_NULL_PTR /))
+    end do
+  end do
+  ! collapse all copies of the accumulator back onto its owner (rank 0)
+  call fstarpu_mpi_redux_data(comm_world, ahdl)
+  err = fstarpu_mpi_wait_for_all(comm_world)
+
+  if(comm_w_rank.eq.0) then
+    tmp = 0
+    do w_node=1,comm_size-1
+      tmp = tmp + 1.0 / (w_node+1.0)
+    end do
+    write(*,*) 'computed result ---> ',a, "expected =",&
+      1.0 + (comm_size-1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1.0)*3.0 + tmp)
+  end if
+  err = fstarpu_mpi_barrier(comm_world)
+  call fstarpu_data_unregister(ahdl)
+  do w_node=1,comm_size-1
+    call fstarpu_data_unregister(bhdl(w_node))
+  end do
+  call fstarpu_codelet_free(work_cl)
+
+  end do
+
+  call fstarpu_fxt_stop_profiling()
+  call fstarpu_codelet_free(task_red_cl)
+  call fstarpu_codelet_free(task_ini_cl)
+
+
+  err = fstarpu_mpi_shutdown()
+  call fstarpu_shutdown()
+  deallocate(b, bhdl)
+  stop
+
+contains
+
+  ! Work kernel: a = a + 3.0 + b (sleeps to simulate task duration)
+  recursive subroutine cl_cpu_task (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    integer(c_int) :: ret, worker_id
+    integer        :: comm_rank
+    integer, target :: i
+    real(kind(1.d0)), pointer :: a, b
+    real(kind(1.d0))          :: old_a
+
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), b)
+    call nf_sleep(1.d0)
+    old_a = a
+    a = old_a + 3.0 + b
+    write(*,*) "task   (c_w_rank:",comm_rank," worker_id:",worker_id,") from ",old_a,"to",a
+
+    return
+  end subroutine cl_cpu_task
+
+  ! Reduction kernel: fold source copy (buffer 1) into destination (buffer 0)
+  recursive subroutine cl_cpu_task_red (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    integer(c_int) :: ret, worker_id
+    integer, target                         :: comm_rank
+    real(kind(1.d0)), pointer :: as, ad
+    real(kind(1.d0))           :: old_ad
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), ad)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), as)
+    old_ad = ad
+    ad = ad + as
+    call nf_sleep(1.d0)
+    write(*,*) "red_cl (c_w_rank:",comm_rank,"worker_id:",worker_id,")",as, old_ad, ' ---> ',ad
+
+    return
+  end subroutine cl_cpu_task_red
+
+  ! Init kernel: deliberately non-neutral seed (the MPI rank), to make the
+  ! number of spawned copies visible in the result
+  recursive subroutine cl_cpu_task_ini (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args
+        ! cl_args is unused
+    integer(c_int) :: ret, worker_id
+    integer, target                         :: comm_rank
+    real(kind(1.d0)), pointer :: a
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call nf_sleep(0.5d0)
+    ! As this codelet is run by each worker in the REDUX mode case
+    ! this initialization makes salient the number of copies spawned
+    write(*,*) "ini_cl (c_w_rank:",comm_rank,"worker_id:",worker_id,") set to", comm_rank, "(was",a,")"
+    a = comm_rank
+    return
+  end subroutine cl_cpu_task_ini
+
+  ! Busy-wait for t seconds using the system clock (portable "sleep")
+  subroutine nf_sleep(t)
+    implicit none
+    integer :: t_start, t_end, t_rate
+    real(kind(1.d0))     :: ta, t
+    call system_clock(t_start)
+    do
+       call system_clock(t_end, t_rate)
+       ta = real(t_end-t_start)/real(t_rate)
+       if(ta.gt.t) return
+    end do
+  end subroutine nf_sleep
+
+end program

+ 238 - 0
mpi/examples/native_fortran/nf_redux_test.f90

@@ -0,0 +1,238 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+!
+! Native-Fortran STARPU_REDUX example with exactly 4 ranks: rank 0 owns
+! (a1, b1), rank 1 owns (a2, b2); one task per accumulator is executed on
+! ranks 3 and 2, then each accumulator is reduced back onto its owner.
+program main
+  use iso_c_binding
+  use fstarpu_mod
+  use fstarpu_mpi_mod
+
+  implicit none
+
+  integer, target                         :: ret, np, i, j
+  type(c_ptr)                             :: task_cl, task_rw_cl, task_red_cl, task_ini_cl
+  character(kind=c_char,len=*), parameter :: name=C_CHAR_"task"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: namered=C_CHAR_"task_red"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: nameini=C_CHAR_"task_ini"//C_NULL_CHAR
+  real(kind(1.d0)), target                :: a1, a2, b1, b2
+  integer(kind=8)                          :: tag, err
+  type(c_ptr)                             :: a1hdl, a2hdl, b1hdl, b2hdl
+  integer, target                         :: comm, comm_world, comm_w_rank, comm_size
+  integer(c_int), target                  :: w_node
+
+  call fstarpu_fxt_autostart_profiling(0)
+  ret = fstarpu_init(c_null_ptr)
+  ret = fstarpu_mpi_init(1)
+
+  comm_world = fstarpu_mpi_world_comm()
+  comm_w_rank  = fstarpu_mpi_world_rank()
+  comm_size  = fstarpu_mpi_world_size()
+  if (comm_size.ne.4) then
+    write(*,'(" ")')
+    write(*,'("This application is meant to run with 4 MPI")')
+    stop 1
+  end if
+  err   = fstarpu_mpi_barrier(comm_world)
+
+  if(comm_w_rank.eq.0) then
+    write(*,'(" ")')
+    a1 = 1.0
+    write(*,*) "init_a1", a1
+    b1 = 0.5
+    write(*,*) "init b1", b1
+  end if
+  if(comm_w_rank.eq.1) then
+    write(*,'(" ")')
+    a2 = 2.0
+    write(*,*) "init_a2", a2
+    b2 = 0.8
+    write(*,*) "init b2", b2
+  end if
+
+  ! allocate and fill codelet structs
+  task_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_cl, name)
+  call fstarpu_codelet_add_cpu_func(task_cl, C_FUNLOC(cl_cpu_task))
+  call fstarpu_codelet_add_buffer(task_cl, FSTARPU_REDUX)
+  call fstarpu_codelet_add_buffer(task_cl, FSTARPU_R)
+
+  ! allocate and reduction codelets
+  task_red_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_red_cl, namered)
+  call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW)
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
+
+  task_ini_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_ini_cl, nameini)
+  call fstarpu_codelet_add_cpu_func(task_ini_cl,C_FUNLOC(cl_cpu_task_ini))
+  call fstarpu_codelet_add_buffer(task_ini_cl, FSTARPU_W)
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+  ! rank 0 owns a1/b1; other ranks register placeholder handles
+  tag = 0
+  if(comm_w_rank.eq.0) then
+        call fstarpu_variable_data_register(a1hdl, 0, c_loc(a1),c_sizeof(a1))
+        call fstarpu_variable_data_register(b1hdl, 0, c_loc(b1),c_sizeof(b1))
+  else
+        call fstarpu_variable_data_register(a1hdl, -1, c_null_ptr,c_sizeof(a1))
+        call fstarpu_variable_data_register(b1hdl, -1, c_null_ptr,c_sizeof(b1))
+  end if
+  call fstarpu_mpi_data_register(a1hdl,tag,0)
+  call fstarpu_mpi_data_register(b1hdl, tag+1,0)
+
+  ! rank 1 owns a2/b2
+  tag = tag + 2
+  if(comm_w_rank.eq.1) then
+        call fstarpu_variable_data_register(a2hdl, 0, c_loc(a2),c_sizeof(a2))
+        call fstarpu_variable_data_register(b2hdl, 0, c_loc(b2),c_sizeof(b2))
+  else
+        call fstarpu_variable_data_register(a2hdl, -1, c_null_ptr,c_sizeof(a2))
+        call fstarpu_variable_data_register(b2hdl, -1, c_null_ptr,c_sizeof(b2))
+  end if
+  call fstarpu_mpi_data_register(a2hdl,tag,1)
+  call fstarpu_mpi_data_register(b2hdl, tag+1, 1)
+  tag = tag + 2
+
+  call fstarpu_data_set_reduction_methods(a1hdl, task_red_cl,task_ini_cl)
+  call fstarpu_data_set_reduction_methods(a2hdl, task_red_cl,task_ini_cl)
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+  call fstarpu_fxt_start_profiling()
+
+  ! run one task per accumulator on a remote rank (3 for a1, 2 for a2)
+  w_node = 3
+  comm = comm_world
+  call fstarpu_mpi_task_insert( (/ c_loc(comm),   &
+             task_cl,                                         &
+             FSTARPU_REDUX, a1hdl,                            &
+             FSTARPU_R, b1hdl,                                &
+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+             C_NULL_PTR /))
+  w_node = 2
+  comm = comm_world
+  call fstarpu_mpi_task_insert( (/ c_loc(comm),   &
+             task_cl,                                         &
+             FSTARPU_REDUX, a2hdl,                            &
+             FSTARPU_R, b2hdl,                                &
+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+             C_NULL_PTR /))
+
+  ! reduce each accumulator back onto its owner rank
+  call fstarpu_mpi_redux_data(comm_world, a1hdl)
+  call fstarpu_mpi_redux_data(comm_world, a2hdl)
+  ! write(*,*) "waiting all tasks ..."
+  err = fstarpu_mpi_wait_for_all(comm_world)
+
+  if(comm_w_rank.eq.0) then
+     write(*,*) 'computed result ---> ',a1, "expected =",4.5
+  end if
+  if(comm_w_rank.eq.1) then
+     write(*,*) 'computed result ---> ',a2, "expected=",5.8
+  end if
+  call fstarpu_data_unregister(a1hdl)
+  call fstarpu_data_unregister(a2hdl)
+  call fstarpu_data_unregister(b1hdl)
+  call fstarpu_data_unregister(b2hdl)
+
+  call fstarpu_fxt_stop_profiling()
+  call fstarpu_codelet_free(task_cl)
+  call fstarpu_codelet_free(task_red_cl)
+  call fstarpu_codelet_free(task_ini_cl)
+
+
+  err = fstarpu_mpi_shutdown()
+  call fstarpu_shutdown()
+
+  stop
+
+contains
+
+  ! Work kernel: a = 3.0 + b (replaces the init-seeded copy; sleeps 1s)
+  recursive subroutine cl_cpu_task (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    integer(c_int) :: ret, worker_id
+    integer        :: comm_rank
+    integer, target :: i
+    real(kind(1.d0)), pointer :: a, b
+    real(kind(1.d0))          :: old_a
+
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), b)
+    call nf_sleep(1.d0)
+    old_a = a
+    a = 3.0 + b
+    write(*,*) "task   (c_w_rank:",comm_rank,") from ",old_a,"to",a
+
+    return
+  end subroutine cl_cpu_task
+
+  ! Reduction kernel: fold source copy (buffer 1) into destination (buffer 0)
+  recursive subroutine cl_cpu_task_red (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    integer(c_int) :: ret
+    integer, target                         :: comm_rank
+    real(kind(1.d0)), pointer :: as, ad
+    real(kind(1.d0))           :: old_ad
+
+    comm_rank  = fstarpu_mpi_world_rank()
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), ad)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), as)
+    old_ad = ad
+    ad = ad + as
+    call nf_sleep(1.d0)
+    write(*,*) "red_cl (c_w_rank:",comm_rank,")",as, old_ad, ' ---> ',ad
+
+    return
+  end subroutine cl_cpu_task_red
+
+  ! Init kernel: neutral element (0.0) for the additive reduction
+  recursive subroutine cl_cpu_task_ini (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args
+        ! cl_args is unused
+    integer(c_int) :: ret
+    integer, target                         :: comm_rank
+    real(kind(1.d0)), pointer :: a
+
+    comm_rank  = fstarpu_mpi_world_rank()
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call nf_sleep(0.5d0)
+    a = 0.0
+    write(*,*) "ini_cl (c_w_rank:",comm_rank,")"
+    return
+  end subroutine cl_cpu_task_ini
+
+  ! Busy-wait for t seconds using the system clock (portable "sleep")
+  subroutine nf_sleep(t)
+    implicit none
+    integer :: t_start, t_end, t_rate
+    real(kind(1.d0))     :: ta, t
+    call system_clock(t_start)
+    do
+       call system_clock(t_end, t_rate)
+       ta = real(t_end-t_start)/real(t_rate)
+       if(ta.gt.t) return
+    end do
+  end subroutine nf_sleep
+
+end program main

+ 9 - 0
mpi/include/starpu_mpi.h

@@ -232,6 +232,11 @@ int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, s
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
 
 /**
+   Same as starpu_mpi_irecv_detached(), but with an additional \p prio parameter.
+*/
+int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);
+
+/**
    Post a nonblocking receive in \p data_handle from the node \p
    source using the message tag \p data_tag within the communicator \p
    comm. On completion, the \p callback function is called with the
@@ -561,6 +566,10 @@ int starpu_mpi_data_get_rank(starpu_data_handle_t handle);
    Return the tag of the given data.
 */
 starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t handle);
+/**
+   Return the redux map of the given data.
+*/
+char* starpu_mpi_data_get_redux_map(starpu_data_handle_t handle);
 
 /**
    Symbol kept for backward compatibility. Call function starpu_mpi_data_get_tag()

+ 0 - 1
mpi/src/mpi/starpu_mpi_early_data.h

@@ -40,7 +40,6 @@ LIST_TYPE(_starpu_mpi_early_data_handle,
 	  void *buffer;
 	  size_t size;
 	  unsigned buffer_node;
-	  int req_ready;
 	  struct _starpu_mpi_node_tag node_tag;
 	  starpu_pthread_mutex_t req_mutex;
 	  starpu_pthread_cond_t req_cond;

+ 40 - 34
mpi/src/mpi/starpu_mpi_mpi.c

@@ -50,6 +50,9 @@ static unsigned nready_process;
 /* Number of send requests to submit to MPI at the same time */
 static unsigned ndetached_send;
 
+/* Force allocation of early data */
+static int early_data_force_allocate;
+
 #ifdef STARPU_USE_FXT
 static void _starpu_mpi_add_sync_point_in_fxt(void);
 #endif
@@ -81,6 +84,11 @@ static starpu_pthread_t progress_thread;
 #endif
 static int running = 0;
 
+/* Provides synchronization between an early request, a sync request, and an early data handle:
+ * we keep it held while checking and posting one to prevent the other.
+ * This is to be taken always before the progress_mutex. */
+static starpu_pthread_mutex_t early_data_mutex;
+
 /* Driver taken by StarPU-MPI to process tasks when there is no requests to
  * handle instead of polling endlessly */
 static struct starpu_driver *mpi_driver = NULL;
@@ -103,7 +111,7 @@ static int posted_requests = 0, ready_requests = 0, newer_requests, mpi_wait_for
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 #define _STARPU_MPI_INC_READY_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_ready_requests); ready_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_ready_requests); }
 
-extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
+extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count, int prio);
 
 #ifdef STARPU_SIMGRID
 #pragma weak smpi_simulated_main_
@@ -182,8 +190,6 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
 	_STARPU_MPI_DEBUG(0, "new req %p srcdst %d tag %"PRIi64" and type %s %d\n", req, req->node_tag.node.rank, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->backend->is_internal_req);
 
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
-
 	if (req->request_type == RECV_REQ)
 	{
 		/* Case : the request is the internal receive request submitted
@@ -206,6 +212,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 			}
 
+			STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 					  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr,
 					  req->datatype_name, (int)req->count, req->registered_datatype);
@@ -213,31 +220,24 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			_STARPU_MPI_INC_READY_REQUESTS(+1);
 
 			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
-			STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-			STARPU_PTHREAD_MUTEX_LOCK(&req->backend->posted_mutex);
 			req->posted = 1;
 			STARPU_PTHREAD_COND_BROADCAST(&req->backend->posted_cond);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->posted_mutex);
-			STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		}
 		else
 		{
+			STARPU_PTHREAD_MUTEX_LOCK(&early_data_mutex);
 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
 
 			if (early_data_handle)
 			{
+				/* Got the early_data_handle */
+				STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
+
 				/* Case: a receive request for a data with the given tag and source has already been
 				 * posted to MPI by StarPU. Asynchronously requests a Read permission over the temporary handle ,
 				 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
 				 * will be called to bring the data back to the original data handle associated to the request.*/
-				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
-				while (!(early_data_handle->req_ready))
-					STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req_cond), &(early_data_handle->req_mutex));
-				STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req_mutex));
-				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
-
 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %"PRIi64" has already been received, copying previously received data into handle's pointer..\n", req, req->node_tag.data_tag);
 				STARPU_ASSERT(req->data_handle != early_data_handle->handle);
 
@@ -254,9 +254,8 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				cb_args->req = req;
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
-				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 				// FIXME: when buffer == NULL, do not hardcode acquiring on early_data_handle->buffer_node, to just acquire where the data happens to have been stored by MPI
-				starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(early_data_handle->handle,early_data_handle->buffer_node,STARPU_R,NULL,_starpu_mpi_early_data_cb,(void*) cb_args,  1, 0, NULL, NULL);
+				starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(early_data_handle->handle,early_data_handle->buffer_node,STARPU_R,NULL,_starpu_mpi_early_data_cb,(void*) cb_args,  1, 0, NULL, NULL, req->prio);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			}
 			else
@@ -265,6 +264,8 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
 				if (sync_req)
 				{
+					/* Got the sync req */
+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 					/* Case: we already received the send envelope, we can proceed with the receive */
 					req->sync = 1;
 					_starpu_mpi_datatype_allocate(req->data_handle, req);
@@ -279,6 +280,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 						STARPU_ASSERT(req->count);
 						req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 					}
+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_STARPU_MPI_INC_READY_REQUESTS(+1);
 					/* Throw away the dumb request that was only used to know that we got the envelope */
@@ -288,13 +290,17 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				{
 					/* Case: no matching data has been received. Store the receive request as an early_request. */
 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 					_starpu_mpi_early_request_enqueue(req);
+					/* We have queued our early request, we can let the progression thread look at it */
+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 				}
 			}
 		}
 	}
 	else
 	{
+		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		if (req->request_type == SEND_REQ)
 			_starpu_mpi_req_prio_list_push_front(&ready_send_requests, req);
 		else
@@ -1157,13 +1163,11 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 	_starpu_mpi_early_data_add(early_data_handle);
 
 	starpu_data_handle_t data_handle;
-	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	data_handle = _starpu_mpi_tag_get_data_handle_from_tag(envelope->data_tag);
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 
 	// TODO: rather select some memory node next to the NIC
 	unsigned buffer_node = STARPU_MAIN_RAM;
-	if (data_handle && starpu_data_get_interface_id(data_handle) < STARPU_MAX_INTERFACE_ID)
+	if (data_handle && starpu_data_get_interface_id(data_handle) < STARPU_MAX_INTERFACE_ID && !early_data_force_allocate)
 	{
 		/* We know which data will receive it and we won't have to unpack, use just the same kind of data.  */
 		early_data_handle->buffer = NULL;
@@ -1190,25 +1194,16 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	early_data_handle->req = _starpu_mpi_irecv_common(early_data_handle->handle, status.MPI_SOURCE,
 							  early_data_handle->node_tag.data_tag, comm, 1, 0,
-							  NULL, NULL, 1, 1, envelope->size);
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+							  NULL, NULL, 1, 1, envelope->size, STARPU_DEFAULT_PRIO);
+	/* The early data handle is ready, we can let _starpu_mpi_submit_ready_request
+	 * proceed with acquiring it */
+	STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 
+	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	// We wait until the request is pushed in the
 	// ready_request list
-	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-	STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->backend->posted_mutex));
 	while (!(early_data_handle->req->posted))
-		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->backend->posted_cond), &(early_data_handle->req->backend->posted_mutex));
-	STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->backend->posted_mutex));
-
-#ifdef STARPU_DEVEL
-#warning check if req_ready is still necessary
-#endif
-	STARPU_PTHREAD_MUTEX_LOCK(&early_data_handle->req_mutex);
-	early_data_handle->req_ready = 1;
-	STARPU_PTHREAD_COND_BROADCAST(&early_data_handle->req_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->backend->posted_cond), &progress_mutex);
 
 	// Handle the request immediatly to make sure the mpi_irecv is
 	// posted before receiving an other envelope
@@ -1421,6 +1416,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 				{
 					_STARPU_MPI_DEBUG(3, "Searching for application request with tag %"PRIi64" and source %d (size %ld)\n", envelope->data_tag, envelope_status.MPI_SOURCE, envelope->size);
 
+					STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&early_data_mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 					struct _starpu_mpi_req *early_request = _starpu_mpi_early_request_dequeue(envelope->data_tag, envelope_status.MPI_SOURCE, envelope_comm);
 
 					/* Case: a data will arrive before a matching receive is
@@ -1453,9 +1451,12 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 							new_req->backend->is_internal_req = 0; // ????
 							new_req->count = envelope->size;
 							_starpu_mpi_sync_data_add(new_req);
+							/* We have queued our sync request, we can let _starpu_mpi_submit_ready_request find it */
+							STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 						}
 						else
 						{
+							/* This will release early_data_mutex when appropriate */
 							_starpu_mpi_receive_early_data(envelope, envelope_status, envelope_comm);
 						}
 					}
@@ -1466,6 +1467,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					 * _starpu_mpi_handle_ready_request. */
 					else
 					{
+						/* Got the early request */
+						STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 						_STARPU_MPI_DEBUG(2000, "A matching application request has been found for the incoming data with tag %"PRIi64"\n", envelope->data_tag);
 						_STARPU_MPI_DEBUG(2000, "Request sync %d\n", envelope->sync);
 
@@ -1621,6 +1624,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 {
         STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
+        STARPU_PTHREAD_MUTEX_INIT(&early_data_mutex, NULL);
         STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
         STARPU_PTHREAD_COND_INIT(&barrier_cond, NULL);
 	_starpu_mpi_req_list_init(&ready_recv_requests);
@@ -1634,6 +1638,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
 	nready_process = starpu_get_env_number_default("STARPU_MPI_NREADY_PROCESS", 10);
 	ndetached_send = starpu_get_env_number_default("STARPU_MPI_NDETACHED_SEND", 10);
+	early_data_force_allocate = starpu_get_env_number_default("STARPU_MPI_EARLYDATA_ALLOCATE", 0);
 
 #ifdef STARPU_SIMGRID
 	STARPU_PTHREAD_MUTEX_INIT(&wait_counter_mutex, NULL);
@@ -1688,6 +1693,7 @@ void _starpu_mpi_progress_shutdown(void **value)
         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_posted_requests);
         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_ready_requests);
         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
+        STARPU_PTHREAD_MUTEX_DESTROY(&early_data_mutex);
         STARPU_PTHREAD_COND_DESTROY(&barrier_cond);
 }
 

+ 0 - 2
mpi/src/mpi/starpu_mpi_mpi_backend.c

@@ -54,7 +54,6 @@ void _starpu_mpi_mpi_backend_request_init(struct _starpu_mpi_req *req)
 
 	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->req_mutex, NULL);
 	STARPU_PTHREAD_COND_INIT0(&req->backend->req_cond, NULL);
-	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->posted_mutex, NULL);
 	STARPU_PTHREAD_COND_INIT0(&req->backend->posted_cond, NULL);
 
 	//req->backend->other_request = NULL;
@@ -80,7 +79,6 @@ void _starpu_mpi_mpi_backend_request_destroy(struct _starpu_mpi_req *req)
 {
 	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->req_mutex);
 	STARPU_PTHREAD_COND_DESTROY(&req->backend->req_cond);
-	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->posted_mutex);
 	STARPU_PTHREAD_COND_DESTROY(&req->backend->posted_cond);
 	free(req->backend);
 	req->backend = NULL;

+ 0 - 1
mpi/src/mpi/starpu_mpi_mpi_backend.h

@@ -54,7 +54,6 @@ struct _starpu_mpi_req_backend
 
 	starpu_pthread_mutex_t req_mutex;
 	starpu_pthread_cond_t req_cond;
-	starpu_pthread_mutex_t posted_mutex;
 	starpu_pthread_cond_t posted_cond;
 	/** In the case of a Wait/Test request, we are going to post a request
 	 * to test the completion of another request */

+ 28 - 9
mpi/src/starpu_mpi.c

@@ -161,12 +161,12 @@ static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum sta
 
 	if (sequential_consistency)
 	{
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid, req->prio);
 	}
 	else
 	{
 		/* post_sync_job_id has already been filled */
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL, req->prio);
 	}
 }
 
@@ -289,7 +289,7 @@ int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, starp
 	return starpu_mpi_issend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
 }
 
-struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count)
+struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count, int prio)
 {
 	if (_starpu_mpi_fake_world_size != -1)
 	{
@@ -297,7 +297,7 @@ struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handl
 		return NULL;
 	}
 
-	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, 0, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
+	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, prio, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
 	_starpu_mpi_req_willpost(req);
 
 	if (sequential_consistency == 0)
@@ -317,7 +317,7 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 
 	struct _starpu_mpi_req *req;
 	_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(source, data_tag);
-	req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0);
+	req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0, STARPU_DEFAULT_PRIO);
 	_STARPU_MPI_TRACE_IRECV_COMPLETE_END(source, data_tag);
 
 	STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
@@ -331,7 +331,17 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, star
 {
 	_STARPU_MPI_LOG_IN();
 
-	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0);
+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, STARPU_DEFAULT_PRIO);
+	_STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
+{
+	_STARPU_MPI_LOG_IN();
+
+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, prio);
+
 	_STARPU_MPI_LOG_OUT();
 	return 0;
 }
@@ -340,7 +350,7 @@ int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_h
 {
 	_STARPU_MPI_LOG_IN();
 
-	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0);
+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0, STARPU_DEFAULT_PRIO);
 
 	_STARPU_MPI_LOG_OUT();
 	return 0;
@@ -379,10 +389,13 @@ int starpu_mpi_barrier(MPI_Comm comm)
 
 void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
 {
+	struct _starpu_mpi_data *data = data_handle->mpi_data;
 	_mpi_backend._starpu_mpi_backend_data_clear(data_handle);
 	_starpu_mpi_cache_data_clear(data_handle);
-	_starpu_spin_destroy(&((struct _starpu_mpi_data*) data_handle->mpi_data)->coop_lock);
-	free(data_handle->mpi_data);
+	_starpu_spin_destroy(&data->coop_lock);
+	if (data->redux_map != REDUX_CONTRIB)
+		free(data->redux_map);
+	free(data);
 	data_handle->mpi_data = NULL;
 }
 
@@ -448,6 +461,12 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
 	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
 }
 
+char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data)
+{
+	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
+	return ((struct _starpu_mpi_data *)(data->mpi_data))->redux_map;
+}
+
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
 {
 	int me, rank;

+ 1 - 2
mpi/src/starpu_mpi_coop_sends.c

@@ -297,8 +297,7 @@ void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_
 
 	if (first)
 		/* We were first, we are responsible for acquiring the data for everybody */
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, -1, mode, _starpu_mpi_coop_send_acquired_callback, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &coop_sends->pre_sync_jobid, NULL);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, -1, mode, _starpu_mpi_coop_send_acquired_callback, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &coop_sends->pre_sync_jobid, NULL, req->prio);
 	else
 		req->pre_sync_jobid = coop_sends->pre_sync_jobid;
 }
-

+ 16 - 6
mpi/src/starpu_mpi_private.h

@@ -118,7 +118,7 @@ int _starpu_debug_rank;
 			fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%ld:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
 			fflush(stderr);	\
 		} \
-	} while(0);
+	} while(0)
 #  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
 #  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm)  _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
 #  define _STARPU_MPI_DEBUG(level, fmt, ...) \
@@ -130,7 +130,7 @@ int _starpu_debug_rank;
 			fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__,## __VA_ARGS__); \
 			fflush(stderr); \
 		} \
-	} while(0);
+	} while(0)
 #else
 #  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way)  do { } while(0)
 #  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm)     do { } while(0)
@@ -141,10 +141,10 @@ int _starpu_debug_rank;
 #define _STARPU_MPI_DISP(fmt, ...) do { if (!_starpu_silent) { \
 	       				     if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
                                              fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
-                                             fflush(stderr); }} while(0);
+                                             fflush(stderr); }} while(0)
 #define _STARPU_MPI_MSG(fmt, ...) do { if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
                                              fprintf(stderr, "[%d][starpu_mpi][%s:%d] " fmt , _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
-                                             fflush(stderr); } while(0);
+                                             fflush(stderr); } while(0)
 
 #ifdef STARPU_MPI_EXTRA_VERBOSE
 #  define _STARPU_MPI_LOG_IN()             do { if (!_starpu_silent) { \
@@ -203,6 +203,12 @@ struct _starpu_mpi_coop_sends
 	long pre_sync_jobid;
 };
 
+/** cf. redux_map field : this is the value
+ * put in this field whenever a node contributes
+ * to the reduction of the data.
+ * Only the owning node keeps track of all the contributing nodes. */
+#define REDUX_CONTRIB ((char*) -1)
+
 /** Initialized in starpu_mpi_data_register_comm */
 struct _starpu_mpi_data
 {
@@ -211,8 +217,12 @@ struct _starpu_mpi_data
 	char *cache_sent;
 	int cache_received;
 
-	/** Rendez-vous data for opportunistic cooperative sends */
-	/** Needed to synchronize between submit thread and workers */
+	/** Array used to store the contributing nodes to this data
+	  * when it is accessed in REDUX mode. */
+	char* redux_map;
+
+	/** Rendez-vous data for opportunistic cooperative sends,
+	  * Needed to synchronize between submit thread and workers */
 	struct _starpu_spinlock coop_lock;
 	/** Current cooperative send bag */
 	struct _starpu_mpi_coop_sends *coop_sends;

+ 55 - 20
mpi/src/starpu_mpi_task_insert.c

@@ -100,7 +100,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 	{
 		STARPU_ASSERT_MSG(starpu_mpi_data_get_rank(data) == STARPU_MPI_PER_NODE, "If task is replicated, it has to access only per-node data");
 	}
-	if (data && mode & STARPU_R)
+	if (data && mode & STARPU_R && !(mode & STARPU_MPI_REDUX))
 	{
 		int mpi_rank = starpu_mpi_data_get_rank(data);
 		starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(data);
@@ -118,7 +118,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 				if (data_tag == -1)
 					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 				_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data, mpi_rank);
-				starpu_mpi_irecv_detached(data, mpi_rank, data_tag, comm, NULL, NULL);
+				starpu_mpi_irecv_detached_prio(data, mpi_rank, data_tag, prio, comm, NULL, NULL);
 			}
 			// else the node has already received the data
 		}
@@ -142,7 +142,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 static
 void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm)
 {
-	if (mode & STARPU_W)
+	if (mode & STARPU_W && !(mode & STARPU_MPI_REDUX))
 	{
 		int mpi_rank = starpu_mpi_data_get_rank(data);
 		starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(data);
@@ -179,7 +179,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 {
 	if (_starpu_cache_enabled)
 	{
-		if (mode & STARPU_W || mode & STARPU_REDUX)
+		if ((mode & STARPU_W && !(mode & STARPU_MPI_REDUX)) || mode & STARPU_REDUX)
 		{
 			/* The data has been modified, it MUST be removed from the cache */
 			starpu_mpi_cached_send_clear(data);
@@ -189,7 +189,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 	else
 	{
 		/* We allocated a temporary buffer for the received data, now drop it */
-		if ((mode & STARPU_R) && do_execute)
+		if ((mode & STARPU_R && !(mode & STARPU_MPI_REDUX)) && do_execute)
 		{
 			int mpi_rank = starpu_mpi_data_get_rank(data);
 			if (mpi_rank == STARPU_MPI_PER_NODE)
@@ -254,7 +254,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 				inconsistent_execute = 0;
 			}
 		}
-		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX || arg_type & STARPU_MPI_REDUX)
 		{
 			starpu_data_handle_t data = va_arg(varg_list_copy, starpu_data_handle_t);
 			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
@@ -617,6 +617,20 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 
 	for(i=0 ; i<nb_data ; i++)
 	{
+		if ((descrs[i].mode & STARPU_REDUX || descrs[i].mode & STARPU_MPI_REDUX) && descrs[i].handle)
+		{
+			struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data;
+			if (me == starpu_mpi_data_get_rank(descrs[i].handle))
+			{
+				int size;
+				starpu_mpi_comm_size(comm, &size);
+				if (mpi_data->redux_map == NULL)
+					_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
+				mpi_data->redux_map [xrank] = 1;
+			}
+			else if (me == xrank)
+				mpi_data->redux_map = REDUX_CONTRIB;
+		}
 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
 	}
@@ -813,6 +827,11 @@ void _starpu_mpi_redux_fill_post_sync_jobid(const void * const redux_data_args,
 
 /* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
  * a data previously accessed in REDUX mode gets accessed in R mode. */
+/* FIXME: In order to prevent simultaneous receive submissions
+ * on the same handle, we need to wait that all the starpu_mpi
+ * tasks are done before submitting next tasks. The current
+ * version of the implementation does not support multiple
+ * simultaneous receive requests on the same handle.*/
 void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
 {
 	int me, rank, nb_nodes;
@@ -820,6 +839,7 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 
 	rank = starpu_mpi_data_get_rank(data_handle);
 	data_tag = starpu_mpi_data_get_tag(data_handle);
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 	if (rank == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
@@ -832,12 +852,16 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 	starpu_mpi_comm_rank(comm, &me);
 	starpu_mpi_comm_size(comm, &nb_nodes);
 
-	_STARPU_MPI_DEBUG(1, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
-
+	_STARPU_MPI_DEBUG(50, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
 	// need to count how many nodes have the data in redux mode
 	if (me == rank)
 	{
-		int i;
+		int i,j;
+		_STARPU_MPI_DEBUG(50, "Who is in the map ?\n");
+		for (j = 0; j<nb_nodes; j++)
+		{
+			_STARPU_MPI_DEBUG(50, "%d is in the map ? %d\n", j, mpi_data->redux_map[j]);
+		}
 
 		// taskC depends on all taskBs created
 		// Creating synchronization task and use its jobid for tracing
@@ -848,8 +872,9 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 
 		for(i=0 ; i<nb_nodes ; i++)
 		{
-			if (i != rank)
+			if (i != rank && mpi_data->redux_map[i])
 			{
+				_STARPU_MPI_DEBUG(5, "%d takes part in the reduction of %p \n", i, data_handle);
 				/* We need to make sure all is
 				 * executed after data_handle finished
 				 * its last read access, we hence do
@@ -893,24 +918,34 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 						   STARPU_CALLBACK_WITH_ARG_NFREE, _starpu_mpi_redux_data_recv_callback, args,
 						   0);
 			}
+			else
+			{
+				_STARPU_MPI_DEBUG(5, "%d is not in the map or is me\n", i);
+			}
 		}
 
 		int ret = starpu_task_submit(taskC);
 		STARPU_ASSERT(ret == 0);
 	}
-	else
+	else if (mpi_data->redux_map)
 	{
-		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
+		STARPU_ASSERT(mpi_data->redux_map == REDUX_CONTRIB);
+		_STARPU_MPI_DEBUG(5, "Sending redux handle to %d ...\n", rank);
 		starpu_mpi_isend_detached_prio(data_handle, rank, data_tag, prio, comm, NULL, NULL);
-		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
+		starpu_data_invalidate_submit(data_handle);
 	}
-	/* FIXME: In order to prevent simultaneous receive submissions
-	 * on the same handle, we need to wait that all the starpu_mpi
-	 * tasks are done before submitting next tasks. The current
-	 * version of the implementation does not support multiple
-	 * simultaneous receive requests on the same handle.*/
-	starpu_task_wait_for_all();
-
+	else
+	{
+		_STARPU_MPI_DEBUG(5, "I am not in the map of %d, I am %d ...\n", rank, me);
+	}
+	if (mpi_data->redux_map != NULL)
+	{
+		_STARPU_MPI_DEBUG(100, "waiting for redux tasks with %d\n", rank);
+		starpu_task_wait_for_all();
+	}
+	if (me == rank)
+		free(mpi_data->redux_map);
+	mpi_data->redux_map = NULL;
 }
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 {

+ 1 - 1
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -74,7 +74,7 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 				inconsistent_execute = 0;
 			}
 		}
-		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX || arg_type & STARPU_MPI_REDUX)
 		{
 			arg_i++;
 			starpu_data_handle_t data = arglist[arg_i];

+ 1 - 1
mpi/tests/mpi_reduction.c

@@ -37,7 +37,7 @@ static struct starpu_codelet init_codelet =
 static struct starpu_codelet redux_codelet =
 {
 	.cpu_funcs = {redux_cpu_func},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 #ifdef STARPU_SIMGRID
 	.model = &starpu_perfmodel_nop,

+ 3 - 0
mpi/tests/mpi_redux.c

@@ -14,6 +14,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+/* This test does a manual reduction: all ranks send a number to the rank 0,
+ * the rank 0 sums these numbers and sends back the result to all ranks. */
+
 #include <starpu_mpi.h>
 #include "helper.h"
 

+ 20 - 20
src/common/fxt.h

@@ -342,7 +342,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE1STR
@@ -356,7 +356,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE1STR(CODE, P1, str);		\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE2STR
@@ -377,7 +377,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE2STR
@@ -388,7 +388,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE2STR(CODE, P1, P2, str);		\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE3STR
@@ -410,7 +410,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE3STR
@@ -421,7 +421,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE3STR(CODE, P1, P2, P3, str);	\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE4STR
@@ -444,7 +444,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE4STR
@@ -455,7 +455,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE4STR(CODE, P1, P2, P3, P4, str);	\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE5STR
@@ -479,7 +479,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE5STR
@@ -490,7 +490,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE5STR(CODE, P1, P2, P3, P4, P5, str);	\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE6STR
@@ -515,7 +515,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE6STR
@@ -526,7 +526,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str);	\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE7STR
@@ -552,7 +552,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE7STR
@@ -563,7 +563,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str);	\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifndef FUT_RAW_PROBE7
@@ -787,7 +787,7 @@ do {									\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
 		FUT_FULL_PROBE7(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
 	}								\
-} while(0);
+} while(0)
 
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)			\
 do {									\
@@ -796,7 +796,7 @@ do {									\
 	char _archname[32]=""; \
 	starpu_perfmodel_get_arch_name(perf_arch, _archname, 32, 0);	\
 	_STARPU_FUT_FULL_PROBE5STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_END_CODELET_BODY, (job)->job_id, (job_size), (job_hash), workerid, _starpu_gettid(), _archname); \
-} while(0);
+} while(0)
 
 #define _STARPU_TRACE_START_EXECUTING()				\
 	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_START_EXECUTING, _starpu_gettid());
@@ -898,7 +898,7 @@ do {										\
 	else {									\
 		FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 0);\
 	}									\
-} while(0);
+} while(0)
 
 #define _STARPU_TRACE_DATA_NAME(handle, name) \
 	_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_META, _STARPU_FUT_DATA_NAME, handle, name)
@@ -1319,8 +1319,8 @@ do {										\
 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)          \
        FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_STATE_SHARED, handle, node)
 
-#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre)          \
-       FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_REQUEST_CREATED, orig, dest, prio, handle, is_pre)
+#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre, req)          \
+       FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_REQUEST_CREATED, orig, dest, prio, handle, is_pre, req)
 
 
 #else // !STARPU_USE_FXT
@@ -1451,7 +1451,7 @@ do {										\
 #define _STARPU_TRACE_DATA_STATE_INVALID(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_STATE_OWNER(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)	do {(void)(handle); (void)(node);} while(0)
-#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre) do {(void)(handle); (void)(orig); (void)(dest); (void)(prio); (void)(is_pre);} while(0)
+#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre, req) do {(void)(handle); (void)(orig); (void)(dest); (void)(prio); (void)(is_pre); (void)(req); } while(0)
 #define _STARPU_TRACE_PAPI_TASK_EVENT(event_id, task, value) do {(void)(event_id); (void)(task); (void)(value);} while(0)
 
 #endif // STARPU_USE_FXT

+ 5 - 0
src/common/hash.c

@@ -46,6 +46,11 @@ uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc)
 	return crc;
 }
 
+uint32_t starpu_hash_crc32c_be_ptr(void *input, uint32_t inputcrc)
+{
+	return starpu_hash_crc32c_be_n(&input, sizeof(input), inputcrc);
+}
+
 uint32_t starpu_hash_crc32c_be(uint32_t input, uint32_t inputcrc)
 {
 	uint8_t *p = (uint8_t *)&input;

+ 4 - 4
src/common/uthash.h

@@ -104,12 +104,12 @@ do {
   if (!((tbl)->bloom_bv))  { uthash_fatal( "out of memory"); }                   \
   memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN);                                \
   (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE;                                       \
-} while (0);
+} while (0)
 
 #define HASH_BLOOM_FREE(tbl)                                                     \
 do {                                                                             \
   uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN);                              \
-} while (0);
+} while (0)
 
 #define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8)))
 #define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8)))
@@ -368,7 +368,7 @@ do {
   for(_fn_i=0; _fn_i < keylen; _fn_i++)                                          \
       hashv = (hashv * 16777619) ^ _hf_key[_fn_i];                               \
   bkt = hashv & (num_bkts-1);                                                    \
-} while(0);
+} while(0)
  
 #define HASH_OAT(key,keylen,num_bkts,hashv,bkt)                                  \
 do {                                                                             \
@@ -507,7 +507,7 @@ do {
     hashv ^= hashv << 25;                                                        \
     hashv += hashv >> 6;                                                         \
     bkt = hashv & (num_bkts-1);                                                  \
-} while(0);
+} while(0)
 
 #ifdef HASH_USING_NO_STRICT_ALIASING
 /* The MurmurHash exploits some CPU's (e.g. x86) tolerance for unaligned reads.

+ 1 - 1
src/core/dependencies/data_arbiter_concurrency.c

@@ -286,7 +286,7 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);

+ 2 - 2
src/core/dependencies/data_concurrency.c

@@ -132,7 +132,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);
@@ -266,7 +266,7 @@ static void _starpu_take_data(unsigned request_from_codelet,
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);

+ 6 - 2
src/core/dependencies/implicit_data_deps.c

@@ -225,8 +225,12 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
 		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
 
-		if (mode & STARPU_R)
-			STARPU_ASSERT_MSG(handle->initialized || handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
+		if (mode & STARPU_R && !handle->initialized)
+		{
+			STARPU_ASSERT_MSG(handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
+			/* The task will initialize it with init_cl */
+			handle->initialized = 1;
+		}
 
 		if (mode & STARPU_W || mode == STARPU_REDUX)
 		{

+ 1 - 2
src/core/jobs.c

@@ -288,8 +288,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	{
 		unsigned long jobs = STARPU_ATOMIC_ADDL(&njobs_finished, 1);
 
-		printf("\r%lu tasks finished...", jobs);
-		fflush(stdout);
+		fprintf(stderr,"\r%lu tasks finished (last %lu %p)...", jobs, j->job_id, j->task);
 	}
 
 	struct starpu_task *task = j->task;

+ 18 - 11
src/core/perfmodel/energy_model.c

@@ -43,7 +43,7 @@
 #endif
 #endif
 
-#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  return(retval); } while (0)
+#define ERROR_RETURN(retval, function) do { PAPI_perror(function); fprintf(stderr, "Error %d %s:line %d\n", retval,__FILE__,__LINE__);  return(retval); } while (0)
 
 #if 0
 #define debug(fmt, ...) printf(fmt, ## __VA_ARGS__)
@@ -52,6 +52,7 @@
 #endif
 
 #ifdef STARPU_PAPI
+#ifdef STARPU_HAVE_HWLOC
 static const int N_EVTS = 2;
 
 static int nsockets;
@@ -68,7 +69,7 @@ static int add_event(int EventSet, int socket);
 
 /*must be initialized to PAPI_NULL before calling PAPI_create_event*/
 static int EventSet = PAPI_NULL;
-
+#endif
 #endif
 
 static double t1;
@@ -80,7 +81,7 @@ static nvmlDevice_t device;
 #endif
 #endif
 
-int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
+int starpu_energy_start(int workerid STARPU_ATTRIBUTE_UNUSED, enum starpu_worker_archtype archi)
 {
 	t1 = starpu_timing_now();
 
@@ -100,11 +101,11 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
 		nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
 
 		if ((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_library_init");
 
 		/* Creating the eventset */
 		if ((retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_create_eventset");
 
 		int i;
 		for (i = 0 ; i < nsockets ; i ++ )
@@ -112,19 +113,25 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
 			/* return the index of socket */
 			hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PACKAGE, i);
 			if ( (retval = add_event(EventSet, obj->os_index)) != PAPI_OK)
-				ERROR_RETURN(retval);
+			{
+				if (retval == PAPI_EPERM)
+					_STARPU_DISP("PAPI could not access counters due to permissions errors. Perhaps your system requires to run measurements as root?\n");
+				else if (retval == PAPI_ENOEVNT)
+					_STARPU_DISP("PAPI could not access counters. Perhaps your system requires to run measurements as root?\n");
+				ERROR_RETURN(retval, "PAPI_add_named_event");
+			}
 		}
 
 		/* get the number of events in the event set */
 		number = 0;
 		if ( (retval = PAPI_list_events(EventSet, NULL, &number)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_list_events");
 
 		debug("There are %d events in the event set\n", number);
 
 		/* Start counting */
 		if ( (retval = PAPI_start(EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_start");
 
 		return retval;
 	}
@@ -180,7 +187,7 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
 		/* Stop counting and store the values into the array */
 		if ( (retval = PAPI_stop(EventSet, values)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_stop");
 
 		int k,s;
 
@@ -199,11 +206,11 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
 		/*removes all events from a PAPI event set */
 		if ( (retval = PAPI_cleanup_eventset(EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_cleanup_eventset");
 
 		/*deallocates the memory associated with an empty PAPI EventSet*/
 		if ( (retval = PAPI_destroy_eventset(&EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_destroy_eventset");
 
 		break;
 	}

+ 8 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -1328,7 +1328,7 @@ static void write_bus_latency_file_content(void)
 
 	_STARPU_DEBUG("writing latencies to %s\n", path);
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	if (!f)
 	{
 		perror("fopen write_bus_latency_file_content");
@@ -1337,6 +1337,7 @@ static void write_bus_latency_file_content(void)
 		STARPU_ABORT();
 	}
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 
 	fprintf(f, "# ");
@@ -1684,10 +1685,11 @@ static void write_bus_bandwidth_file_content(void)
 
 	_STARPU_DEBUG("writing bandwidth to %s\n", path);
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
 
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 
 	fprintf(f, "# ");
@@ -2124,9 +2126,10 @@ static void write_bus_config_file_content(void)
 
 	_STARPU_DEBUG("writing config to %s\n", path);
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 
 	fprintf(f, "# Current configuration\n");
@@ -2655,7 +2658,7 @@ static void write_bus_platform_file_content(int version)
 
 	_STARPU_DEBUG("writing platform to %s\n", path);
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	if (!f)
 	{
 		perror("fopen write_bus_platform_file_content");
@@ -2664,6 +2667,7 @@ static void write_bus_platform_file_content(int version)
 		STARPU_ABORT();
 	}
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 
 	fprintf(f,

+ 4 - 3
src/core/perfmodel/perfmodel_history.c

@@ -1177,11 +1177,12 @@ void starpu_save_history_based_model(struct starpu_perfmodel *model)
 
 	/* overwrite existing file, or create it */
 	FILE *f;
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	STARPU_ASSERT_MSG(f, "Could not save performance model %s\n", path);
 
 	locked = _starpu_fwrlock(f) == 0;
 	check_model(model);
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 	dump_model_file(f, model);
 	if (locked)
@@ -1610,10 +1611,10 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 	}
 
 	regmodel = &model->state->per_arch[comb][nimpl].regression;
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 
 	if (regmodel->valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 
 docal:
 	STARPU_HG_DISABLE_CHECKING(model->benchmarking);
@@ -1654,8 +1655,8 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
 	if (regmodel->nl_valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
 	{
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
+		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 	}
 	else
 	{

+ 1 - 20
src/core/sched_policy.c

@@ -206,7 +206,7 @@ struct starpu_sched_policy *_starpu_select_sched_policy(struct _starpu_machine_c
 	if (selected_policy)
 		return selected_policy;
 
-	/* If no policy was specified, we use the eager policy by default */
+	/* If no policy was specified, we use the lws policy by default */
 	return &_starpu_sched_lws_policy;
 }
 
@@ -1153,25 +1153,6 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 	}
 }
 
-void _starpu_wait_on_sched_event(void)
-{
-	struct _starpu_worker *worker = _starpu_get_local_worker_key();
-
-	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
-
-	_starpu_handle_all_pending_node_data_requests(worker->memory_node);
-
-	if (_starpu_machine_is_running())
-	{
-#ifndef STARPU_NON_BLOCKING_DRIVERS
-		STARPU_PTHREAD_COND_WAIT(&worker->sched_cond,
-					  &worker->sched_mutex);
-#endif
-	}
-
-	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
-}
-
 int starpu_push_local_task(int workerid, struct starpu_task *task, int back STARPU_ATTRIBUTE_UNUSED)
 {
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);

+ 0 - 2
src/core/sched_policy.h

@@ -63,8 +63,6 @@ struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx);
 void _starpu_sched_post_exec_hook(struct starpu_task *task);
 int _starpu_pop_task_end(struct starpu_task *task);
 
-void _starpu_wait_on_sched_event(void);
-
 struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
 						   unsigned int node) STARPU_ATTRIBUTE_MALLOC;
 

+ 10 - 0
src/core/workers.c

@@ -1168,6 +1168,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 
 	/* Do not start performance counter collection by default */
 	conf->start_perf_counter_collection = 0;
+
+	conf->cuda_only_fast_alloc_other_memnodes = starpu_get_env_number_default("STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES", 0);
 	return 0;
 }
 
@@ -1531,6 +1533,14 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 		_STARPU_DISP("Warning: STARPU_ENABLE_STATS is enabled, which slows down a bit\n");
 	}
 
+#ifndef STARPU_SIMGRID
+	if (starpu_get_env_number_default("STARPU_SIMGRID", 0))
+	{
+		_STARPU_DISP("Simulation mode requested, but this libstarpu was built without simgrid support, please recompile\n");
+		return -EINVAL;
+	}
+#endif
+
 #if defined(_WIN32) && !defined(__CYGWIN__)
 	WSADATA wsadata;
 	WSAStartup(MAKEWORD(1,0), &wsadata);

+ 78 - 61
src/datawizard/coherency.c

@@ -179,7 +179,6 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 
 	/* the data is present now */
 	unsigned requesting_node = requesting_replicate->memory_node;
-	requesting_replicate->requested &= ~(1UL << requesting_node);
 
 	if (mode & STARPU_W)
 	{
@@ -406,16 +405,18 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * write-only request. */
-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, struct starpu_task *task, enum starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_data_request *r;
 
-	r = replicate->request[node];
-
-	if (r)
+	for (r = replicate->request[node]; r; r = r->next_same_req)
 	{
 		_starpu_spin_checklocked(&r->handle->header_lock);
 
+		if (task && r->task && task != r->task)
+			/* Do not collapse requests for different tasks */
+			continue;
+
 		_starpu_spin_lock(&r->lock);
 
                 /* perhaps we need to "upgrade" the request */
@@ -440,9 +441,12 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 		if (mode & STARPU_W)
 			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int)  STARPU_W);
+
+		/* We collapse with this request */
+		return r;
 	}
 
-	return r;
+	return NULL;
 }
 
 
@@ -469,7 +473,9 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch,
+								  enum starpu_data_access_mode mode,
+								  struct starpu_task *task,
+								  enum starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
@@ -493,8 +499,11 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		unsigned nnodes = starpu_memory_nodes_get_count();
 		for (i = 0; i < nnodes; i++)
 			for (j = 0; j < nnodes; j++)
-				if (handle->per_node[i].request[j])
+			{
+				struct _starpu_data_request *r;
+				for (r = handle->per_node[i].request[j]; r; r = r->next_same_req)
 					nwait++;
+			}
 		/* If the request is not detached (i.e. the caller really wants
 		 * proper ownership), no new requests will appear because a
 		 * reference will be kept on the dst replicate, which will
@@ -531,6 +540,25 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
 			}
+
+			if (task)
+			{
+				unsigned j;
+				unsigned nnodes = starpu_memory_nodes_get_count();
+				/* Cancel any existing (prefetch) request */
+				struct _starpu_data_request *r2;
+				for (j = 0; j < nnodes; j++)
+				{
+					for (r2 = dst_replicate->request[j]; r2; r2 = r2->next_same_req)
+					{
+						if (r2->task && r2->task == task)
+						{
+							r2->canceled = 1;
+							break;
+						}
+					}
+				}
+			}
 		}
 
 		_starpu_spin_unlock(&handle->header_lock);
@@ -568,11 +596,12 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
 		if (mode & STARPU_W)
 			dst_replicate->initialized = 1;
-		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait)
+		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait
+			&& !_starpu_malloc_willpin_on_node(requesting_node))
 		{
-			/* And this is the main RAM, really no need for a
-			 * request, just allocate */
-			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
+			/* And this is the main RAM without pinning, really no need for a
+			 * request, just quickly allocate and be done */
+			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch, 0) == 0)
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);
 				if (dst_replicate->mc)
@@ -629,9 +658,12 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;
 
 		/* Try to reuse a request if possible */
+#ifdef STARPU_DEVEL
+#warning We do not actually want to reuse an existing request when our request is for a task with low priority, that will get executed much later. We don t want to wire down the data in between, at worse that could hog the complete gpu memory...
+#endif
 		r = _starpu_search_existing_data_request(hop_dst_replicate,
 				(mode & STARPU_R)?hop_src_node:hop_dst_node,
-							 mode, is_prefetch);
+							 mode, task, is_prefetch);
 
 		reused_requests[hop] = !!r;
 
@@ -640,7 +672,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			/* Create a new request if there was no request to reuse */
 			r = _starpu_create_data_request(handle, hop_src_replicate,
 							hop_dst_replicate, hop_handling_node,
-							mode, ndeps, is_prefetch, prio, 0, origin);
+							mode, ndeps, task, is_prefetch, prio, 0, origin);
 			nwait++;
 		}
 
@@ -686,7 +718,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		 */
 		struct _starpu_data_request *r = _starpu_create_data_request(handle, dst_replicate,
 							dst_replicate, requesting_node,
-							STARPU_W, nwait, is_prefetch, prio, 1, origin);
+							STARPU_W, nwait, task, is_prefetch, prio, 1, origin);
 
 		/* and perform the callback after termination */
 		_starpu_data_request_append_callback(r, callback_func, callback_arg);
@@ -701,8 +733,8 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		for (i = 0; i < nnodes; i++)
 			for (j = 0; j < nnodes; j++)
 			{
-				struct _starpu_data_request *r2 = handle->per_node[i].request[j];
-				if (r2)
+				struct _starpu_data_request *r2;
+				for (r2 = handle->per_node[i].request[j]; r2; r2 = r2->next_same_req)
 				{
 					_starpu_spin_lock(&r2->lock);
 					if (is_prefetch < r2->prefetch)
@@ -736,7 +768,8 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 }
 
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, enum starpu_is_prefetch is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached,
+			       struct starpu_task *task, enum starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
         _STARPU_LOG_IN();
@@ -745,7 +778,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(1);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -790,7 +823,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
 	struct _starpu_data_request *r;
 	r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
-						 is_prefetch, async, callback_func, callback_arg, prio, origin);
+						 task, is_prefetch, async, callback_func, callback_arg, prio, origin);
 
 	/* If no request was created, the handle was already up-to-date on the
 	 * node. In this case, _starpu_create_request_to_fetch_data has already
@@ -805,24 +838,24 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
         return ret;
 }
 
-static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
 }
 
-static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
 }
 
-static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
 }
 
-static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, task, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
 }
 
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
@@ -861,8 +894,15 @@ uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
 {
 	uint32_t wt_mask;
+	size_t max_wt_mask = sizeof(wt_mask) * 8;
+	unsigned wt_count = starpu_memory_nodes_get_count();
+	if (max_wt_mask > STARPU_MAXNODES)
+		max_wt_mask = STARPU_MAXNODES;
+	if (wt_count > max_wt_mask)
+		wt_count = max_wt_mask;
+
 	wt_mask = default_wt_mask | handle->wt_mask;
-	wt_mask &= (1<<starpu_memory_nodes_get_count())-1;
+	wt_mask &= (1ULL<<max_wt_mask)-1;
 
 	/* Note that it is possible that there is no valid copy of the data (if
 	 * starpu_data_invalidate was called for instance). In that case, we do
@@ -871,14 +911,14 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 	unsigned memory_node = replicate->memory_node;
 
 	if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
-	if (wt_mask & ~(1<<memory_node))
+	if (wt_mask && (memory_node >= max_wt_mask || wt_mask & ~(1<<memory_node)))
 		_starpu_write_through_data(handle, memory_node, wt_mask);
 
 	int cpt = 0;
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(1);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -897,26 +937,6 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
-static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
-{
-	int cpt = 0;
-	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
-	{
-		cpt++;
-		_starpu_datawizard_progress(1);
-	}
-	if (cpt == STARPU_SPIN_MAXTRY)
-		_starpu_spin_lock(&handle->header_lock);
-
-	if (replicate->state == STARPU_INVALID)
-	{
-		unsigned dst_node = replicate->memory_node;
-		replicate->requested |= 1UL << dst_node;
-	}
-
-	_starpu_spin_unlock(&handle->header_lock);
-}
-
 int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio, enum starpu_is_prefetch prefetch)
 {
 #ifdef STARPU_OPENMP
@@ -945,12 +965,9 @@ int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node,
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		if (prefetch == STARPU_PREFETCH)
-		{
-			task_prefetch_data_on_node(handle, node, replicate, mode, prio);
-			_starpu_set_data_requested_flag_if_needed(handle, replicate);
-		}
+			task_prefetch_data_on_node(handle, node, replicate, mode, task, prio);
 		else
-			idle_prefetch_data_on_node(handle, node, replicate, mode, prio);
+			idle_prefetch_data_on_node(handle, node, replicate, mode, task, prio);
 	}
 
 	if (prefetch == STARPU_PREFETCH)
@@ -1117,8 +1134,8 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
 		if (async)
 		{
-			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
-					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, task, STARPU_FETCH, 1,
+					_starpu_fetch_task_input_cb, worker, task->priority, "_starpu_fetch_task_input");
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
 				starpu_sleep(0.000001);
@@ -1133,7 +1150,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 		}
 		else
 		{
-			ret = fetch_data(handle, node, local_replicate, mode, 0);
+			ret = fetch_data(handle, node, local_replicate, mode, task, task->priority);
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
 				starpu_sleep(0.000001);
@@ -1371,7 +1388,7 @@ void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
 
 		local_replicate = get_replicate(handle, mode, -1, node);
 
-		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
+		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, task, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
 	}
 
 	if (profiling && task->profiling_info)
@@ -1421,7 +1438,7 @@ unsigned starpu_data_is_on_node(starpu_data_handle_t handle, unsigned node)
 
 		for (i = 0; i < nnodes; i++)
 		{
-			if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
+			if (handle->per_node[node].request[i])
 				ret = 1;
 		}
 

+ 10 - 10
src/datawizard/coherency.h

@@ -72,15 +72,13 @@ struct _starpu_data_replicate
 	 * */
 	unsigned automatically_allocated:1;
 
-	/** To help the scheduling policies to make some decision, we
-	   may keep a track of the tasks that are likely to request
-	   this data on the current node.
-	   It is the responsability of the scheduling _policy_ to set that
-	   flag when it assigns a task to a queue, policies which do not
-	   use this hint can simply ignore it.
-	 */
-	uint32_t requested;
+	/** This tracks the list of requests to provide the value */
 	struct _starpu_data_request *request[STARPU_MAXNODES];
+	/** This points to the last entry of request, to easily append to the list */
+	struct _starpu_data_request *last_request[STARPU_MAXNODES];
+
+	/** Which request is loading data here */
+	struct _starpu_data_request *load_request;
 
 	/** The number of prefetches that we made for this replicate for various tasks
 	 * This is also the number of tasks that we will wait to see use the mc before
@@ -322,7 +320,8 @@ struct _starpu_data_state
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  */
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, enum starpu_is_prefetch is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached,
+			       struct starpu_task *task, enum starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
@@ -369,7 +368,8 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
  */
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch,
+								  enum starpu_data_access_mode mode,
+								  struct starpu_task *task, enum starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 

+ 3 - 3
src/datawizard/copy_driver.c

@@ -200,7 +200,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 									struct _starpu_data_replicate *dst_replicate,
 									unsigned donotread,
 									struct _starpu_data_request *req,
-									unsigned may_alloc,
+									enum _starpu_may_alloc may_alloc,
 									enum starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
 {
 	if (!donotread)
@@ -215,11 +215,11 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 	/* first make sure the destination has an allocated buffer */
 	if (!dst_replicate->allocated)
 	{
-		if (!may_alloc || _starpu_is_reclaiming(dst_node))
+		if (may_alloc==STARPU_DATAWIZARD_DO_NOT_ALLOC || _starpu_is_reclaiming(dst_node))
 			/* We're not supposed to allocate there at the moment */
 			return -ENOMEM;
 
-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : STARPU_FETCH);
+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, prefetch, may_alloc==STARPU_DATAWIZARD_ONLY_FAST_ALLOC);
 		if (ret_alloc)
 			return -ENOMEM;
 	}

+ 8 - 1
src/datawizard/copy_driver.h

@@ -47,6 +47,13 @@ extern "C"
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 
+enum _starpu_may_alloc
+{
+	STARPU_DATAWIZARD_DO_NOT_ALLOC,
+	STARPU_DATAWIZARD_DO_ALLOC,
+	STARPU_DATAWIZARD_ONLY_FAST_ALLOC
+};
+
 #ifdef STARPU_USE_MIC
 /** MIC needs memory_node to know which MIC is concerned.
  * mark is used to wait asynchronous request.
@@ -131,7 +138,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 				    struct _starpu_data_replicate *dst_replicate,
 				    unsigned donotread,
 				    struct _starpu_data_request *req,
-				    unsigned may_alloc,
+				    enum _starpu_may_alloc may_alloc,
 				    enum starpu_is_prefetch prefetch);
 
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);

+ 306 - 179
src/datawizard/data_request.c

@@ -25,57 +25,67 @@
 #include <core/simgrid.h>
 
 /* requests that have not been treated at all */
-#ifdef STARPU_DEVEL
-#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectionnal
-#endif
-static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
-static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES]; /* Contains both task_prefetch and prefetch */
-static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
-static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES][STARPU_MAXNODES][2]; /* Contains both task_prefetch and prefetch */
+static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES][STARPU_MAXNODES][2];
 
 /* requests that are not terminated (eg. async transfers) */
-static struct _starpu_data_request_prio_list data_requests_pending[STARPU_MAXNODES];
-static unsigned data_requests_npending[STARPU_MAXNODES];
-static starpu_pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list data_requests_pending[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static unsigned data_requests_npending[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static starpu_pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES][STARPU_MAXNODES][2];
 
 void _starpu_init_data_request_lists(void)
 {
-	unsigned i;
+	unsigned i, j;
+	enum _starpu_data_request_inout k;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		_starpu_data_request_prio_list_init(&data_requests[i]);
-		_starpu_data_request_prio_list_init(&prefetch_requests[i]);
-		_starpu_data_request_prio_list_init(&idle_requests[i]);
+		for (j = 0; j < STARPU_MAXNODES; j++)
+		{
+			for (k = _STARPU_DATA_REQUEST_IN; k <= _STARPU_DATA_REQUEST_OUT; k++)
+			{
+				_starpu_data_request_prio_list_init(&data_requests[i][j][k]);
+				_starpu_data_request_prio_list_init(&prefetch_requests[i][j][k]);
+				_starpu_data_request_prio_list_init(&idle_requests[i][j][k]);
 
 #ifndef STARPU_DEBUG
-		/* Tell helgrind that we are fine with checking for list_empty
-		 * in _starpu_handle_node_data_requests, we will call it
-		 * periodically anyway */
-		STARPU_HG_DISABLE_CHECKING(data_requests[i].tree.root);
-		STARPU_HG_DISABLE_CHECKING(prefetch_requests[i].tree.root);
-		STARPU_HG_DISABLE_CHECKING(idle_requests[i].tree.root);
+				/* Tell helgrind that we are fine with checking for list_empty
+				 * in _starpu_handle_node_data_requests, we will call it
+				 * periodically anyway */
+				STARPU_HG_DISABLE_CHECKING(data_requests[i][j][k].tree.root);
+				STARPU_HG_DISABLE_CHECKING(prefetch_requests[i][j][k].tree.root);
+				STARPU_HG_DISABLE_CHECKING(idle_requests[i][j][k].tree.root);
 #endif
+				_starpu_data_request_prio_list_init(&data_requests_pending[i][j][k]);
+				data_requests_npending[i][j][k] = 0;
 
-		STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i], NULL);
-
-		_starpu_data_request_prio_list_init(&data_requests_pending[i]);
-		data_requests_npending[i] = 0;
-		STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i], NULL);
+				STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i][j][k], NULL);
+				STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i][j][k], NULL);
+			}
+		}
 	}
 	STARPU_HG_DISABLE_CHECKING(data_requests_npending);
 }
 
 void _starpu_deinit_data_request_lists(void)
 {
-	unsigned i;
+	unsigned i, j;
+	enum _starpu_data_request_inout k;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		_starpu_data_request_prio_list_deinit(&data_requests[i]);
-		_starpu_data_request_prio_list_deinit(&prefetch_requests[i]);
-		_starpu_data_request_prio_list_deinit(&idle_requests[i]);
-		STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i]);
-		_starpu_data_request_prio_list_deinit(&data_requests_pending[i]);
-		STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i]);
+		for (j = 0; j < STARPU_MAXNODES; j++)
+		{
+			for (k = _STARPU_DATA_REQUEST_IN; k <= _STARPU_DATA_REQUEST_OUT; k++)
+			{
+				_starpu_data_request_prio_list_deinit(&data_requests[i][j][k]);
+				_starpu_data_request_prio_list_deinit(&prefetch_requests[i][j][k]);
+				_starpu_data_request_prio_list_deinit(&idle_requests[i][j][k]);
+				_starpu_data_request_prio_list_deinit(&data_requests_pending[i][j][k]);
+				STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i][j][k]);
+				STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i][j][k]);
+			}
+		}
 	}
 }
 
@@ -92,23 +102,39 @@ static void _starpu_data_request_unlink(struct _starpu_data_request *r)
 		STARPU_ASSERT(r->mode == STARPU_W);
 		r->handle->write_invalidation_req = NULL;
 	}
-	else if (r->mode & STARPU_R)
-	{
-		/* If this is a read request, we store the pending requests
-		 * between src and dst. */
-		unsigned node = r->src_replicate->memory_node;
-		STARPU_ASSERT(r->dst_replicate->request[node] == r);
-		r->dst_replicate->request[node] = NULL;
-	}
 	else
 	{
-		/* If this is a write only request, then there is no source and
-		 * we use the destination node to cache the request. */
-		unsigned node = r->dst_replicate->memory_node;
-		STARPU_ASSERT(r->dst_replicate->request[node] == r);
-		r->dst_replicate->request[node] = NULL;
-	}
+		unsigned node;
+		struct _starpu_data_request **prevp, *prev;
+
+		if (r->mode & STARPU_R)
+			/* If this is a read request, we store the pending requests
+			 * between src and dst. */
+			node = r->src_replicate->memory_node;
+		else
+			/* If this is a write only request, then there is no source and
+			 * we use the destination node to cache the request. */
+			node = r->dst_replicate->memory_node;
+
+		/* Look for ourself in the list, we should be not very far. */
+		for (prevp = &r->dst_replicate->request[node], prev = NULL;
+		     *prevp && *prevp != r;
+		     prev = *prevp, prevp = &prev->next_same_req)
+			;
 
+		STARPU_ASSERT(*prevp == r);
+		*prevp = r->next_same_req;
+
+		if (!r->next_same_req)
+		{
+			/* I was last */
+			STARPU_ASSERT(r->dst_replicate->last_request[node] == r);
+			if (prev)
+				r->dst_replicate->last_request[node] = prev;
+			else
+				r->dst_replicate->last_request[node] = NULL;
+		}
+	}
 }
 
 static void _starpu_data_request_destroy(struct _starpu_data_request *r)
@@ -124,6 +150,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
+							 struct starpu_task *task,
 							 enum starpu_is_prefetch is_prefetch,
 							 int prio,
 							 unsigned is_write_invalidation,
@@ -135,7 +162,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
 	_starpu_spin_init(&r->lock);
 
-	_STARPU_TRACE_DATA_REQUEST_CREATED(handle, src_replicate?src_replicate->memory_node:-1, dst_replicate?dst_replicate->memory_node:-1, prio, is_prefetch);
+	_STARPU_TRACE_DATA_REQUEST_CREATED(handle, src_replicate?src_replicate->memory_node:-1, dst_replicate?dst_replicate->memory_node:-1, prio, is_prefetch, r);
 
 	r->origin = origin;
 	r->handle = handle;
@@ -153,22 +180,48 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	if (handling_node == -1)
 		handling_node = STARPU_MAIN_RAM;
 	r->handling_node = handling_node;
+	if (is_write_invalidation)
+	{
+		r->peer_node = handling_node;
+		r->inout = _STARPU_DATA_REQUEST_IN;
+	}
+	else if (dst_replicate->memory_node == handling_node)
+	{
+		if (src_replicate)
+			r->peer_node = src_replicate->memory_node;
+		else
+			r->peer_node = handling_node;
+		r->inout = _STARPU_DATA_REQUEST_IN;
+	}
+	else
+	{
+		r->peer_node = dst_replicate->memory_node;
+		r->inout = _STARPU_DATA_REQUEST_OUT;
+	}
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	r->completed = 0;
+	r->added_ref = 0;
+	r->canceled = 0;
 	r->prefetch = is_prefetch;
+	r->task = task;
 	r->nb_tasks_prefetch = 0;
 	r->prio = prio;
 	r->retval = -1;
 	r->ndeps = ndeps;
+	r->next_same_req = NULL;
 	r->next_req_count = 0;
 	r->callbacks = NULL;
 	r->com_id = 0;
 
 	_starpu_spin_lock(&r->lock);
 
-	/* Take a reference on the target for the request to be able to write it */
-	if (dst_replicate)
+	/* For a fetch, take a reference as soon as now on the target, to avoid
+	 * replicate eviction */
+	if (is_prefetch == STARPU_FETCH && dst_replicate)
+	{
+		r->added_ref = 1;
 		dst_replicate->refcnt++;
+	}
 	handle->busy_count++;
 
 	if (is_write_invalidation)
@@ -176,20 +229,28 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 		STARPU_ASSERT(!handle->write_invalidation_req);
 		handle->write_invalidation_req = r;
 	}
-	else if (mode & STARPU_R)
-	{
-		unsigned src_node = src_replicate->memory_node;
-		STARPU_ASSERT(!dst_replicate->request[src_node]);
-		dst_replicate->request[src_node] = r;
-		/* Take a reference on the source for the request to be able to read it */
-		src_replicate->refcnt++;
-		handle->busy_count++;
-	}
 	else
 	{
-		unsigned dst_node = dst_replicate->memory_node;
-		STARPU_ASSERT(!dst_replicate->request[dst_node]);
-		dst_replicate->request[dst_node] = r;
+		unsigned node;
+
+		if (mode & STARPU_R)
+			node = src_replicate->memory_node;
+		else
+			node = dst_replicate->memory_node;
+
+		if (!dst_replicate->request[node])
+			dst_replicate->request[node] = r;
+		else
+			dst_replicate->last_request[node]->next_same_req = r;
+		dst_replicate->last_request[node] = r;
+
+		if (mode & STARPU_R)
+		{
+			/* Take a reference on the source for the request to be
+			 * able to read it */
+			src_replicate->refcnt++;
+			handle->busy_count++;
+		}
 	}
 
 	r->refcnt = 1;
@@ -199,7 +260,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	return r;
 }
 
-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc)
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
 {
 	int retval;
 	int do_delete = 0;
@@ -310,14 +371,14 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 	}
 
 	/* insert the request in the proper list */
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][r->peer_node][r->inout]);
 	if (r->prefetch >= STARPU_IDLEFETCH)
-		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
+		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node][r->peer_node][r->inout], r);
 	else if (r->prefetch > STARPU_FETCH)
-		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
+		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node][r->peer_node][r->inout], r);
 	else
-		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node]);
+		_starpu_data_request_prio_list_push_back(&data_requests[handling_node][r->peer_node][r->inout], r);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][r->peer_node][r->inout]);
 
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	_starpu_wake_all_blocked_workers_on_node(handling_node);
@@ -352,7 +413,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
 
-	if (dst_replicate)
+	if (r->canceled < 2 && dst_replicate)
 	{
 #ifdef STARPU_MEMORY_STATS
 		enum _starpu_cache_state old_src_replicate_state = src_replicate->state;
@@ -360,6 +421,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
 		_starpu_spin_checklocked(&handle->header_lock);
 		_starpu_update_data_state(handle, r->dst_replicate, mode);
+		dst_replicate->load_request = NULL;
 
 #ifdef STARPU_MEMORY_STATS
 		if (src_replicate->state == STARPU_INVALID)
@@ -382,7 +444,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 #endif
 	}
 
-	if (r->com_id > 0)
+	if (r->canceled < 2 && r->com_id > 0)
 	{
 #ifdef STARPU_USE_FXT
 		unsigned src_node = src_replicate->memory_node;
@@ -414,12 +476,15 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	/* Remove a reference on the destination replicate for the request */
 	if (dst_replicate)
 	{
-		if (dst_replicate->mc)
+		if (r->canceled < 2 && dst_replicate->mc)
 			/* Make sure it stays there for the task.  */
 			dst_replicate->nb_tasks_prefetch += r->nb_tasks_prefetch;
 
-		STARPU_ASSERT(dst_replicate->refcnt > 0);
-		dst_replicate->refcnt--;
+		if (r->added_ref)
+		{
+			STARPU_ASSERT(dst_replicate->refcnt > 0);
+			dst_replicate->refcnt--;
+		}
 	}
 	STARPU_ASSERT(handle->busy_count > 0);
 	handle->busy_count--;
@@ -467,8 +532,16 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	}
 }
 
+void _starpu_data_request_complete_wait(void *arg)
+{
+	struct _starpu_data_request *r = arg;
+	_starpu_spin_lock(&r->handle->header_lock);
+	_starpu_spin_lock(&r->lock);
+	starpu_handle_data_request_completion(r);
+}
+
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, enum starpu_is_prefetch prefetch)
+static int starpu_handle_data_request(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
 {
 	starpu_data_handle_t handle = r->handle;
 
@@ -491,12 +564,50 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	struct _starpu_data_replicate *src_replicate = r->src_replicate;
 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
+	if (r->canceled)
+	{
+		/* Ok, canceled before starting copies etc. */
+		r->canceled = 2;
+		/* Nothing left to do */
+		starpu_handle_data_request_completion(r);
+		return 0;
+	}
+
+	if (dst_replicate)
+	{
+		struct _starpu_data_request *r2 = dst_replicate->load_request;
+		if (r2 && r2 != r)
+		{
+			/* Oh, some other transfer is already loading the value. Just wait for it */
+			r->canceled = 2;
+			_starpu_spin_unlock(&r->lock);
+			_starpu_spin_lock(&r2->lock);
+			_starpu_data_request_append_callback(r2, _starpu_data_request_complete_wait, r);
+			_starpu_spin_unlock(&r2->lock);
+			_starpu_spin_unlock(&handle->header_lock);
+			return 0;
+		}
+
+		/* We are loading this replicate.
+		 * Note: we might fail to allocate memory, but we will keep going and others will wait for us. */
+		dst_replicate->load_request = r;
+	}
+
 	enum starpu_data_access_mode r_mode = r->mode;
 
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->allocated);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->refcnt);
 
+	/* For prefetches, we take a reference on the destination only now that
+	 * we will really try to fetch the data (instead of in
+	 * _starpu_create_data_request) */
+	if (dst_replicate && r->prefetch > STARPU_FETCH)
+	{
+		r->added_ref = 1;	/* Note: we might get upgraded while trying to allocate */
+		dst_replicate->refcnt++;
+	}
+
 	_starpu_spin_unlock(&r->lock);
 
 	/* FIXME: the request may get upgraded from here to freeing it... */
@@ -507,7 +618,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
 	if (dst_replicate && dst_replicate->state == STARPU_INVALID)
 		r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
-						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc, prefetch);
+						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc, r->prefetch);
 	else
 		/* Already valid actually, no need to transfer anything */
 		r->retval = 0;
@@ -516,6 +627,15 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	{
 		/* If there was not enough memory, we will try to redo the
 		 * request later. */
+
+		if (r->prefetch > STARPU_FETCH)
+		{
+			STARPU_ASSERT(r->added_ref);
+			/* Drop ref until next try */
+			r->added_ref = 0;
+			dst_replicate->refcnt--;
+		}
+
 		_starpu_spin_unlock(&handle->header_lock);
 		return -ENOMEM;
 	}
@@ -528,10 +648,10 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 		 * requests in the meantime. */
 		_starpu_spin_unlock(&handle->header_lock);
 
-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
-		_starpu_data_request_prio_list_push_back(&data_requests_pending[r->handling_node], r);
-		data_requests_npending[r->handling_node]++;
-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node][r->peer_node][r->inout]);
+		_starpu_data_request_prio_list_push_back(&data_requests_pending[r->handling_node][r->peer_node][r->inout], r);
+		data_requests_npending[r->handling_node][r->peer_node][r->inout]++;
+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node][r->peer_node][r->inout]);
 
 		return -EAGAIN;
 	}
@@ -543,10 +663,9 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 }
 
-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list reqlist[STARPU_MAXNODES][STARPU_MAXNODES][2], unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
 {
 	struct _starpu_data_request *r;
-	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
 	unsigned i;
 	int ret = 0;
 
@@ -556,48 +675,55 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 	/* This is racy, but not posing problems actually, since we know we
 	 * will come back here to probe again regularly anyway.
 	 * Thus, do not expose this optimization to helgrind */
-	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&reqlist[src_node]))
+	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]))
 		return 0;
 #endif
 
-	/* TODO optimize */
+	/* We create a new list to pick up some requests from the main list, and
+	 * we handle the request(s) one by one from it, without concurrency issues.
+	 */
+	struct _starpu_data_request_list local_list, remain_list;
+	_starpu_data_request_list_init(&local_list);
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 	/* take all the entries from the request list */
-	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[src_node]))
+	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]))
 	{
 		/* List is busy, do not bother with it */
 		return -EBUSY;
 	}
 #else
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
 #endif
 
-	if (_starpu_data_request_prio_list_empty(&reqlist[src_node]))
+	for (i = data_requests_npending[handling_node][peer_node][inout];
+		i < n && ! _starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]);
+		i++)
 	{
-		/* there is no request */
-                STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
-		return 0;
+		r = _starpu_data_request_prio_list_pop_front_highest(&reqlist[handling_node][peer_node][inout]);
+		_starpu_data_request_list_push_back(&local_list, r);
 	}
 
-	/* There is an entry: we create a new empty list to replace the list of
-	 * requests, and we handle the request(s) one by one in the former
-	 * list, without concurrency issues.*/
-	struct _starpu_data_request_prio_list local_list = reqlist[src_node];
-	_starpu_data_request_prio_list_init(&reqlist[src_node]);
+	if (!_starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]))
+		/* We have left some requests */
+		ret = -EBUSY;
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+	if (_starpu_data_request_list_empty(&local_list))
+		/* there is no request */
+		return 0;
 
-	for (i = 0; i <= prefetch; i++)
-		_starpu_data_request_prio_list_init(&new_data_requests[i]);
+	/* This will contain the remaining requests */
+	_starpu_data_request_list_init(&remain_list);
 
 	double start = starpu_timing_now();
 	/* for all entries of the list */
-	while (!_starpu_data_request_prio_list_empty(&local_list))
+	while (!_starpu_data_request_list_empty(&local_list))
 	{
                 int res;
 
-		if (data_requests_npending[src_node] >= n)
+		if (data_requests_npending[handling_node][peer_node][inout] >= n)
 		{
 			/* Too many requests at the same time, skip pushing
 			 * more for now */
@@ -605,21 +731,22 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			break;
 		}
 
-		r = _starpu_data_request_prio_list_pop_front_highest(&local_list);
+		r = _starpu_data_request_list_pop_front(&local_list);
 
-		res = starpu_handle_data_request(r, may_alloc, prefetch);
+		res = starpu_handle_data_request(r, may_alloc);
 		if (res != 0 && res != -EAGAIN)
 		{
 			/* handle is busy, or not enough memory, postpone for now */
 			ret = res;
 			/* Prefetch requests might have gotten promoted while in tmp list */
-			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
+			_starpu_data_request_list_push_back(&remain_list, r);
 			if (prefetch > STARPU_FETCH)
 				/* Prefetching more there would make the situation even worse */
 				break;
 		}
+		else
+			(*pushed)++;
 
-		(*pushed)++;
 		if (starpu_timing_now() - start >= MAX_PUSH_TIME)
 		{
 			/* We have spent a lot of time doing requests, skip pushing more for now */
@@ -628,43 +755,23 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 		}
 	}
 
-	/* Push back requests we didn't handle on the proper list */
-	while (!_starpu_data_request_prio_list_empty(&local_list))
-	{
-		r = _starpu_data_request_prio_list_pop_front_highest(&local_list);
-		/* Prefetch requests might have gotten promoted while in tmp list */
-		_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
-	}
-	_starpu_data_request_prio_list_deinit(&local_list);
-
-	for (i = 0; i <= prefetch; i++)
-		if (!_starpu_data_request_prio_list_empty(&new_data_requests[i]))
-			break;
+	/* Gather remainder */
+	_starpu_data_request_list_push_list_back(&remain_list, &local_list);
 
-	if (i <= prefetch)
+	if (!_starpu_data_request_list_empty(&remain_list))
 	{
-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
-		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_FETCH])))
-		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_FETCH], &data_requests[src_node]);
-			data_requests[src_node] = new_data_requests[STARPU_FETCH];
-		}
-		if (prefetch >= STARPU_TASK_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_TASK_PREFETCH])))
+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
+		while (!_starpu_data_request_list_empty(&remain_list))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_TASK_PREFETCH], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[STARPU_TASK_PREFETCH];
-		}
-		if (prefetch >= STARPU_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_PREFETCH])))
-		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_PREFETCH], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[STARPU_PREFETCH];
-		}
-		if (prefetch >= STARPU_IDLEFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_IDLEFETCH])))
-		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_IDLEFETCH], &idle_requests[src_node]);
-			idle_requests[src_node] = new_data_requests[STARPU_IDLEFETCH];
+			r = _starpu_data_request_list_pop_back(&remain_list);
+			if (r->prefetch >= STARPU_IDLEFETCH)
+				_starpu_data_request_prio_list_push_front(&idle_requests[handling_node][r->peer_node][r->inout], r);
+			else if (r->prefetch > STARPU_FETCH)
+				_starpu_data_request_prio_list_push_front(&prefetch_requests[handling_node][r->peer_node][r->inout], r);
+			else
+				_starpu_data_request_prio_list_push_front(&data_requests[handling_node][r->peer_node][r->inout], r);
 		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
 
 #ifdef STARPU_SIMGRID
 		if (*pushed)
@@ -676,32 +783,32 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			 * for eviction to happen.
 			 */
 			starpu_sleep(0.000001);
-			_starpu_wake_all_blocked_workers_on_node(src_node);
+			_starpu_wake_all_blocked_workers_on_node(handling_node);
 		}
 #elif !defined(STARPU_NON_BLOCKING_DRIVERS)
-		_starpu_wake_all_blocked_workers_on_node(src_node);
+		_starpu_wake_all_blocked_workers_on_node(handling_node);
 #endif
 	}
 
 	return ret;
 }
 
-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
+	return __starpu_handle_node_data_requests(data_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
 }
 
-int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
+	return __starpu_handle_node_data_requests(prefetch_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
 }
 
-int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
+	return __starpu_handle_node_data_requests(idle_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
 }
 
-static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
+static int _handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned force)
 {
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
 //
@@ -712,14 +819,14 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 	/* Here helgrind would should that this is an un protected access.
 	 * We however don't care about missing an entry, we will get called
 	 * again sooner or later. */
-	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&data_requests_pending[src_node]))
+	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&data_requests_pending[handling_node][peer_node][inout]))
 		return 0;
 #endif
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 	if (!force)
 	{
-		if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[src_node]))
+		if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]))
 		{
 			/* List is busy, do not bother with it */
 			return 0;
@@ -728,19 +835,19 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 	else
 #endif
 		/* We really want to handle requests */
-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 
-	if (_starpu_data_request_prio_list_empty(&data_requests_pending[src_node]))
+	if (_starpu_data_request_prio_list_empty(&data_requests_pending[handling_node][peer_node][inout]))
 	{
 		/* there is no request */
-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 		return 0;
 	}
 	/* for all entries of the list */
-	struct _starpu_data_request_prio_list local_list = data_requests_pending[src_node];
-	_starpu_data_request_prio_list_init(&data_requests_pending[src_node]);
+	struct _starpu_data_request_prio_list local_list = data_requests_pending[handling_node][peer_node][inout];
+	_starpu_data_request_prio_list_init(&data_requests_pending[handling_node][peer_node][inout]);
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 
 	_starpu_data_request_prio_list_init(&new_data_requests_pending);
 	taken = 0;
@@ -803,55 +910,75 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 		}
 	}
 	_starpu_data_request_prio_list_deinit(&local_list);
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
-	data_requests_npending[src_node] -= taken - kept;
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
+	data_requests_npending[handling_node][peer_node][inout] -= taken - kept;
 	if (kept)
-		_starpu_data_request_prio_list_push_prio_list_back(&data_requests_pending[src_node], &new_data_requests_pending);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+		_starpu_data_request_prio_list_push_prio_list_back(&data_requests_pending[handling_node][peer_node][inout], &new_data_requests_pending);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 
 	return taken - kept;
 }
 
-int _starpu_handle_pending_node_data_requests(unsigned src_node)
+int _starpu_handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
-	return _handle_pending_node_data_requests(src_node, 0);
+	return _handle_pending_node_data_requests(handling_node, peer_node, inout, 0);
 }
 
-int _starpu_handle_all_pending_node_data_requests(unsigned src_node)
+int _starpu_handle_all_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
-	return _handle_pending_node_data_requests(src_node, 1);
+	return _handle_pending_node_data_requests(handling_node, peer_node, inout, 1);
 }
 
 /* Note: the returned value will be outdated since the locks are not taken at
  * entry/exit */
-int _starpu_check_that_no_data_request_exists(unsigned node)
+static int __starpu_check_that_no_data_request_exists(unsigned node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
 	int no_request;
 	int no_pending;
 
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[node]);
-	no_request = _starpu_data_request_prio_list_empty(&data_requests[node])
-	          && _starpu_data_request_prio_list_empty(&prefetch_requests[node])
-		  && _starpu_data_request_prio_list_empty(&idle_requests[node]);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[node]);
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[node]);
-	no_pending = !data_requests_npending[node];
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[node][peer_node][inout]);
+	no_request = _starpu_data_request_prio_list_empty(&data_requests[node][peer_node][inout])
+	          && _starpu_data_request_prio_list_empty(&prefetch_requests[node][peer_node][inout])
+		  && _starpu_data_request_prio_list_empty(&idle_requests[node][peer_node][inout]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[node][peer_node][inout]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[node][peer_node][inout]);
+	no_pending = !data_requests_npending[node][peer_node][inout];
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[node][peer_node][inout]);
 
 	return no_request && no_pending;
 }
 
+int _starpu_check_that_no_data_request_exists(unsigned node)
+{
+	unsigned peer_node, nnodes = starpu_memory_nodes_get_count();
+
+	for (peer_node = 0; peer_node < nnodes; peer_node++)
+		if (!__starpu_check_that_no_data_request_exists(node, peer_node, _STARPU_DATA_REQUEST_IN)
+		 || !__starpu_check_that_no_data_request_exists(node, peer_node, _STARPU_DATA_REQUEST_OUT))
+		 return 0;
+	 return 1;
+}
+
 /* Note: the returned value will be outdated since the locks are not taken at
  * entry/exit */
-int _starpu_check_that_no_data_request_is_pending(unsigned node)
+int _starpu_check_that_no_data_request_is_pending(unsigned node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
-	return !data_requests_npending[node];
+	return !data_requests_npending[node][peer_node][inout];
 }
 
 
 void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum starpu_is_prefetch prefetch)
 {
+	_starpu_spin_checklocked(&r->handle->header_lock);
 	STARPU_ASSERT(r->prefetch > prefetch);
+
+	if (prefetch == STARPU_FETCH && !r->added_ref)
+	{
+		/* That would have been done by _starpu_create_data_request */
+		r->added_ref = 1;
+		r->dst_replicate->refcnt++;
+	}
+
 	r->prefetch=prefetch;
 
 	if (prefetch >= STARPU_IDLEFETCH)
@@ -867,27 +994,27 @@ void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum starpu_
 			_starpu_update_prefetch_status(next_req, prefetch);
 	}
 
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node][r->peer_node][r->inout]);
 
 	int found = 1;
 
 	/* The request can be in a different list (handling request or the temp list)
 	 * we have to check that it is really in the prefetch or idle list. */
-	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
-		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node], r);
-	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
-		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node], r);
+	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node][r->peer_node][r->inout], r))
+		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node][r->peer_node][r->inout], r);
+	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node][r->peer_node][r->inout], r))
+		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node][r->peer_node][r->inout], r);
 	else
 		found = 0;
 
 	if (found)
 	{
 		if (prefetch > STARPU_FETCH)
-			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
+			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node][r->peer_node][r->inout],r);
 		else
-			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
+			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node][r->peer_node][r->inout],r);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[r->handling_node]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[r->handling_node][r->peer_node][r->inout]);
 
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	_starpu_wake_all_blocked_workers_on_node(r->handling_node);

+ 34 - 13
src/datawizard/data_request.h

@@ -32,8 +32,8 @@
  * Data interfaces should also have to declare how many asynchronous requests
  * they have actually started (think of e.g. csr).
  */
-#define MAX_PENDING_REQUESTS_PER_NODE 20
-#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 10
+#define MAX_PENDING_REQUESTS_PER_NODE 5
+#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 2
 #define MAX_PENDING_IDLE_REQUESTS_PER_NODE 1
 /** Maximum time in us that we can afford pushing requests before going back to the driver loop, e.g. for checking GPU task termination */
 #define MAX_PUSH_TIME 1000
@@ -47,6 +47,11 @@ struct _starpu_callback_list
 	struct _starpu_callback_list *next;
 };
 
+enum _starpu_data_request_inout
+{
+	_STARPU_DATA_REQUEST_IN, _STARPU_DATA_REQUEST_OUT
+};
+
 /** This represents a data request, i.e. we want some data to get transferred
  * from a source to a destination. */
 LIST_TYPE(_starpu_data_request,
@@ -63,6 +68,8 @@ LIST_TYPE(_starpu_data_request,
 	 * the node can make the CUDA/OpenCL calls.
 	 */
 	unsigned handling_node;
+	unsigned peer_node;
+	enum _starpu_data_request_inout inout;
 
 	/*
 	 * What the destination node wants to do with the data: write to it,
@@ -78,10 +85,19 @@ LIST_TYPE(_starpu_data_request,
 	struct _starpu_async_channel async_channel;
 
 	/** Whether the transfer is completed. */
-	unsigned completed;
+	unsigned completed:1;
+
+	/** Whether we have already added our reference to the dst replicate. */
+	unsigned added_ref:1;
+
+	/** Whether the request was canceled before being handled (because the transfer already happened another way). */
+	unsigned canceled:2;
 
 	/** Whether this is just a prefetch request */
-	enum starpu_is_prefetch prefetch;
+	enum starpu_is_prefetch prefetch:3;
+
+	/** Task this request is for */
+	struct starpu_task *task;
 
 	/** Number of tasks which used this as a prefetch */
 	unsigned nb_tasks_prefetch;
@@ -96,6 +112,10 @@ LIST_TYPE(_starpu_data_request,
 	 * dependencies. */
 	unsigned ndeps;
 
+	/** Some further tasks may have requested prefetches for the same data
+	 * much later on, link with them */
+	struct _starpu_data_request *next_same_req;
+
 	/** in case we have a chain of request (eg. for nvidia multi-GPU), this
 	 * is the list of requests which are waiting for this one. */
 	struct _starpu_data_request *next_req[STARPU_MAXNODES+1];
@@ -123,7 +143,7 @@ LIST_TYPE(_starpu_data_requester,
 
 	int prio;
 
-	/** if this is more complicated ... (eg. application request) 
+	/** if this is more complicated ... (eg. application request)
 	 * NB: this callback is not called with the lock taken !
 	 */
 	void (*ready_data_callback)(void *argcb);
@@ -135,15 +155,15 @@ void _starpu_init_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
 void _starpu_post_data_request(struct _starpu_data_request *r);
 /** returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
-int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
-int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
 
-int _starpu_handle_pending_node_data_requests(unsigned src_node);
-int _starpu_handle_all_pending_node_data_requests(unsigned src_node);
+int _starpu_handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
+int _starpu_handle_all_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
 
-int _starpu_check_that_no_data_request_exists(unsigned node);
-int _starpu_check_that_no_data_request_is_pending(unsigned node);
+int _starpu_check_that_no_data_request_exists(unsigned handling_node);
+int _starpu_check_that_no_data_request_is_pending(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
 
 struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t handle,
 							 struct _starpu_data_replicate *src_replicate,
@@ -151,12 +171,13 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
+							 struct starpu_task *task,
 							 enum starpu_is_prefetch is_prefetch,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
 
-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc);
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc);
 
 void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 					  void (*callback_func)(void *),

+ 87 - 25
src/datawizard/datawizard.c

@@ -26,19 +26,17 @@
 #include <core/simgrid.h>
 #endif
 
-int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests)
+static int ____starpu_datawizard_progress(unsigned memory_node, unsigned peer_start, unsigned peer_end, enum  _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned push_requests)
 {
 	int ret = 0;
-
-#ifdef STARPU_SIMGRID
-	/* XXX */
-	starpu_sleep(0.000001);
-#endif
-	STARPU_UYIELD();
+	unsigned peer_node;
 
 	/* in case some other driver requested data */
-	if (_starpu_handle_pending_node_data_requests(memory_node))
-		ret = 1;
+	for (peer_node = peer_start; peer_node < peer_end; peer_node++)
+	{
+		if (_starpu_handle_pending_node_data_requests(memory_node, peer_node, inout))
+			ret = 1;
+	}
 
 	starpu_memchunk_tidy(memory_node);
 
@@ -46,26 +44,70 @@ int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsi
 	{
 		/* Some transfers have finished, or the driver requests to really push more */
 		unsigned pushed;
-		if (_starpu_handle_node_data_requests(memory_node, may_alloc, &pushed) == 0)
+		unsigned ok = 1;
+
+		for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
 		{
+			if (_starpu_handle_node_data_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
+				ok = 0;
 			if (pushed)
 				ret = 1;
+		}
+
+		if (ok)
+		{
+			unsigned doidle = 1;
+
 			/* We pushed all pending requests, we can afford pushing
 			 * prefetch requests */
-			_starpu_handle_node_prefetch_requests(memory_node, may_alloc, &pushed);
-			if (_starpu_check_that_no_data_request_is_pending(memory_node))
+			for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
+			{
+				if (_starpu_handle_node_prefetch_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
+					ok = 0;
+				if (pushed)
+					ret = 1;
+				if (!_starpu_check_that_no_data_request_is_pending(memory_node, peer_node, inout))
+					doidle = 0;
+			}
+
+			if (doidle)
 				/* No pending transfer, push some idle transfer */
-				_starpu_handle_node_idle_requests(memory_node, may_alloc, &pushed);
+				for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
+				{
+					if (_starpu_handle_node_idle_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
+						ok = 0;
+					if (pushed)
+						ret = 1;
+				}
 		}
-		if (pushed)
-			ret = 1;
+
 	}
-	_starpu_execute_registered_progression_hooks();
 
 	return ret;
 }
 
-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
+static int ___starpu_datawizard_progress(unsigned memory_node, unsigned nnodes, enum _starpu_may_alloc may_alloc, unsigned push_requests)
+{
+	int ret = 0;
+	unsigned peer_node;
+
+#ifdef STARPU_SIMGRID
+	/* XXX */
+	starpu_sleep(0.000001);
+#endif
+	STARPU_UYIELD();
+
+	/* First handle all incoming transfers */
+	ret |= ____starpu_datawizard_progress(memory_node, 0, nnodes, _STARPU_DATA_REQUEST_IN, may_alloc, push_requests);
+
+	/* Then handle outgoing transfers */
+	for (peer_node = 0; peer_node < nnodes; peer_node++)
+		ret |= ____starpu_datawizard_progress(memory_node, peer_node, peer_node+1, _STARPU_DATA_REQUEST_OUT, may_alloc, push_requests);
+
+	return ret;
+}
+
+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests)
 {
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
         unsigned memnode;
@@ -77,7 +119,8 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 		int nnumas = starpu_memory_nodes_get_numa_count();
 		int numa;
 		for (numa = 0; numa < nnumas; numa++)
-			ret |=  ___starpu_datawizard_progress(numa, may_alloc, push_requests);
+			ret |=  ___starpu_datawizard_progress(numa, nnumas, may_alloc, push_requests);
+		_starpu_execute_registered_progression_hooks();
 
 		return ret;
 	}
@@ -87,19 +130,38 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 		worker = &worker->set->workers[0];
 
 	unsigned current_worker_id = worker->workerid;
-        int ret = 0;
+	int ret = 0;
 	unsigned nnodes = starpu_memory_nodes_get_count();
 
-        for (memnode = 0; memnode < nnodes; memnode++)
-        {
-                if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
-                        ret |= ___starpu_datawizard_progress(memnode, may_alloc, push_requests);
-        }
+	for (memnode = 0; memnode < nnodes; memnode++)
+	{
+		if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
+		{
+			if(_starpu_config.conf.cuda_only_fast_alloc_other_memnodes && worker->arch == STARPU_CUDA_WORKER && worker->memory_node != memnode)
+				ret |=  ___starpu_datawizard_progress(memnode, nnodes, STARPU_DATAWIZARD_ONLY_FAST_ALLOC, push_requests);
+			else
+				ret |=  ___starpu_datawizard_progress(memnode, nnodes, may_alloc, push_requests);
+			}
+	}
+
+	_starpu_execute_registered_progression_hooks();
 
         return ret;
 }
 
-void _starpu_datawizard_progress(unsigned may_alloc)
+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc)
 {
         __starpu_datawizard_progress(may_alloc, 1);
 }
+
+void _starpu_datawizard_handle_all_pending_node_data_requests(unsigned memnode)
+{
+	unsigned nnodes = starpu_memory_nodes_get_count();
+	unsigned memnode2;
+
+	for (memnode2 = 0; memnode2 < nnodes; memnode2++)
+	{
+		_starpu_handle_all_pending_node_data_requests(memnode, memnode2, _STARPU_DATA_REQUEST_IN);
+		_starpu_handle_all_pending_node_data_requests(memnode, memnode2, _STARPU_DATA_REQUEST_OUT);
+	}
+}

+ 8 - 7
src/datawizard/datawizard.h

@@ -34,18 +34,19 @@
 
 #include <core/dependencies/implicit_data_deps.h>
 
-/** Make data transfers progress on node \p memory_node.
+
+/** Make data transfers progress on all memory nodes driven by the current worker.
  *
  * If \p push_requests is 1, it can start new transfers
  *
- * If \p may_alloc is 1, it can allocate destination data for transfers
+ * If \p may_alloc is STARPU_DATAWIZARD_DO_ALLOC, it can allocate destination data for transfers
  * (this is not possible e.g. when spinning for a handle lock)
  */
-int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
-/** Call ___starpu_datawizard_progress() for all memory nodes driven by the
- * current worker */
-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests);
+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests);
 /** Call __starpu_datawizard_progress with push_requests = 1 */
-void _starpu_datawizard_progress(unsigned may_alloc);
+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc);
+
+/** Check for all pending data request progress on node \p memory_node */
+void _starpu_datawizard_handle_all_pending_node_data_requests(unsigned memnode);
 
 #endif // __DATAWIZARD_H__

+ 1 - 1
src/datawizard/filters.c

@@ -193,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		int home_node = initial_handle->home_node;
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 			home_node = STARPU_MAIN_RAM;
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH, 0);
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #endif

+ 5 - 2
src/datawizard/interfaces/data_interface.c

@@ -375,13 +375,14 @@ _starpu_data_initialize_per_worker(starpu_data_handle_t handle)
 		replicate->state = STARPU_INVALID;
 		//replicate->refcnt = 0;
 		replicate->handle = handle;
-		//replicate->requested = 0;
 		//replicate->nb_tasks_prefetch = 0;
 
 		//for (node = 0; node < STARPU_MAXNODES; node++)
 		//{
 		//	replicate->request[node] = NULL;
+		//	replicate->last_request[node] = NULL;
 		//}
+		//replicate->load_request = NULL;
 
 		/* Assuming being used for SCRATCH for now, patched when entering REDUX mode */
 		replicate->relaxed_coherency = 1;
@@ -785,7 +786,7 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	}
 	if (valid)
 	{
-		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
+		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
 	}
@@ -1033,6 +1034,7 @@ retry_busy:
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 		struct _starpu_data_replicate *local = &handle->per_node[node];
+		STARPU_ASSERT(!local->refcnt);
 		if (local->allocated)
 		{
 			_starpu_data_unregister_ram_pointer(handle, node);
@@ -1049,6 +1051,7 @@ retry_busy:
 		for (worker = 0; worker < nworkers; worker++)
 		{
 			struct _starpu_data_replicate *local = &handle->per_worker[worker];
+			STARPU_ASSERT(!local->refcnt);
 			/* free the data copy in a lazy fashion */
 			if (local->allocated && local->automatically_allocated)
 				_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);

+ 10 - 0
src/datawizard/malloc.c

@@ -149,6 +149,15 @@ static int _starpu_malloc_should_pin(int flags)
 	return 0;
 }
 
+int _starpu_malloc_willpin_on_node(unsigned dst_node)
+{
+	int flags = malloc_on_node_default_flags[dst_node];
+	return (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0
+			&& (_starpu_can_submit_cuda_task()
+			    /* || _starpu_can_submit_opencl_task() */
+			));
+}
+
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
 {
 	int ret=0;
@@ -185,6 +194,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 		goto end;
 	}
 
+	/* Note: synchronize this test with _starpu_malloc_willpin_on_node */
 	if (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
 		if (_starpu_can_submit_cuda_task())

+ 7 - 0
src/datawizard/malloc.h

@@ -26,4 +26,11 @@ void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
 
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
 int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags);
+
+/**
+   Returns whether when allocating data on \p dst_node, we will do pinning, i.e.
+   the allocation will be very expensive, and should thus be moved out from the
+   critical path
+  */
+int _starpu_malloc_willpin_on_node(unsigned dst_node);
 #endif

+ 21 - 11
src/datawizard/memalloc.c

@@ -169,7 +169,10 @@ void _starpu_mem_chunk_disk_register(unsigned disk_memnode)
 	{
 		enum starpu_node_kind kind = starpu_node_get_kind(i);
 		if (kind == STARPU_CPU_RAM)
+		{
+			STARPU_HG_DISABLE_CHECKING(evictable[i]);
 			evictable[i] = 1;
+		}
 	}
 }
 
@@ -327,7 +330,7 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 		{
 			/* This is the only copy, push it to destination */
 			struct _starpu_data_request *r;
-			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
+			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
 			/* There is no way we don't need a request, since
 			 * source is OWNER, destination can't be having it */
 			STARPU_ASSERT(r);
@@ -552,8 +555,9 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
 int starpu_data_can_evict(starpu_data_handle_t handle, unsigned node, enum starpu_is_prefetch is_prefetch)
 {
+	STARPU_ASSERT(node < STARPU_MAXNODES);
 	/* This data should be written through to this node, avoid dropping it! */
-	if (handle->wt_mask & (1<<node))
+	if (node < sizeof(handle->wt_mask) * 8 && handle->wt_mask & (1<<node))
 		return 0;
 
 	/* This data was registered from this node, we will not be able to drop it anyway */
@@ -1012,7 +1016,7 @@ restart2:
 				next_mc->remove_notify = &next_mc;
 			}
 			/* Note: this may unlock mc_list! */
-			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, STARPU_FETCH);
+			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, is_prefetch);
 
 			if (orig_next_mc)
 			{
@@ -1179,7 +1183,7 @@ void starpu_memchunk_tidy(unsigned node)
 			if (
 				/* This data should be written through to this node, avoid
 				 * dropping it! */
-				handle->wt_mask & (1<<node)
+				(node < sizeof(handle->wt_mask) * 8 && handle->wt_mask & (1<<node))
 				/* This is partitioned, don't care about the
 				 * whole data, we'll work on the subdatas.  */
 			     || handle->nchildren
@@ -1231,7 +1235,7 @@ void starpu_memchunk_tidy(unsigned node)
 			}
 
 			_starpu_spin_unlock(&mc_lock[node]);
-			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
+			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, NULL, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
 			{
 				/* No request was actually needed??
 				 * Odd, but cope with it.  */
@@ -1442,7 +1446,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
  *
  */
 
-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch)
+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
 {
 	unsigned attempts = 0;
 	starpu_ssize_t allocated_memory;
@@ -1473,6 +1477,12 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	if (!prefetch_oom)
 		_STARPU_TRACE_END_ALLOC_REUSE(dst_node, handle, 0);
 #endif
+
+	/* If this is RAM and pinned this will be slow
+	   In case we only want fast allocations return here */
+	if(only_fast_alloc && starpu_node_get_kind(dst_node) == STARPU_CPU_RAM && _starpu_malloc_willpin_on_node(dst_node))
+		return -ENOMEM;
+
 	STARPU_ASSERT(handle->ops);
 	STARPU_ASSERT(handle->ops->allocate_data_on_node);
 	STARPU_ASSERT(replicate->data_interface);
@@ -1576,7 +1586,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(0);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -1620,7 +1630,7 @@ out:
 	return allocated_memory;
 }
 
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
 {
 	starpu_ssize_t allocated_memory;
 
@@ -1635,7 +1645,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 		return 0;
 
 	STARPU_ASSERT(replicate->data_interface);
-	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch);
+	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch, only_fast_alloc);
 
 	/* perhaps we could really not handle that capacity misses */
 	if (allocated_memory == -ENOMEM)
@@ -1845,7 +1855,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
 			for (i=0; i<nb_numa_nodes; i++)
 			{
-				if (handle->per_node[i].allocated || 
+				if (handle->per_node[i].allocated ||
 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
 				{
 					target = i;
@@ -1877,7 +1887,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
 			for (i=0; i<nb_numa_nodes; i++)
 			{
-				if (handle->per_node[i].allocated || 
+				if (handle->per_node[i].allocated ||
 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
 				{
 					target = i;

+ 1 - 1
src/datawizard/memalloc.h

@@ -83,7 +83,7 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);

+ 3 - 4
src/datawizard/memory_nodes.c

@@ -151,6 +151,7 @@ void _starpu_memory_node_register_condition(struct _starpu_worker *worker, starp
 #undef starpu_worker_get_memory_node
 unsigned starpu_worker_get_memory_node(unsigned workerid)
 {
+	(void) workerid;
 	return _starpu_worker_get_memory_node(workerid);
 }
 
@@ -167,12 +168,10 @@ void _starpu_worker_drives_memory_node(struct _starpu_worker *worker, unsigned m
 	}
 }
 
+#undef starpu_worker_get_local_memory_node
 unsigned starpu_worker_get_local_memory_node(void)
 {
-	struct _starpu_worker *worker = _starpu_get_local_worker_key();
-	if (!worker)
-		return STARPU_MAIN_RAM;
-	return worker->memory_node;
+	return _starpu_worker_get_local_memory_node();
 }
 
 int starpu_memory_node_get_devid(unsigned node)

+ 21 - 0
src/datawizard/memory_nodes.h

@@ -117,12 +117,19 @@ static inline enum starpu_node_kind _starpu_node_get_kind(unsigned node)
 }
 #define starpu_node_get_kind _starpu_node_get_kind
 
+#if STARPU_MAXNODES == 1
+#define _starpu_memory_nodes_get_count() 1
+#else
 static inline unsigned _starpu_memory_nodes_get_count(void)
 {
 	return _starpu_descr.nnodes;
 }
+#endif
 #define starpu_memory_nodes_get_count _starpu_memory_nodes_get_count
 
+#if STARPU_MAXNODES == 1
+#define _starpu_worker_get_memory_node(workerid) 0
+#else
 static inline unsigned _starpu_worker_get_memory_node(unsigned workerid)
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
@@ -139,6 +146,20 @@ static inline unsigned _starpu_worker_get_memory_node(unsigned workerid)
 	return config->combined_workers[workerid - nworkers].memory_node;
 
 }
+#endif
 #define starpu_worker_get_memory_node _starpu_worker_get_memory_node
 
+#if STARPU_MAXNODES == 1
+#define _starpu_worker_get_local_memory_node() 0
+#else
+static inline unsigned _starpu_worker_get_local_memory_node(void)
+{
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	if (!worker)
+		return STARPU_MAIN_RAM;
+	return worker->memory_node;
+}
+#endif
+#define starpu_worker_get_local_memory_node _starpu_worker_get_local_memory_node
+
 #endif // __MEMORY_NODES_H__

+ 11 - 2
src/datawizard/reduction.c

@@ -280,12 +280,21 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 					redux_task->cl = handle->redux_cl;
 					STARPU_ASSERT(redux_task->cl);
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0)))
-						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW, 0);
+						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW|STARPU_COMMUTE, 0);
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 1)))
 						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_R, 1);
 
-					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
+					STARPU_ASSERT_MSG((STARPU_CODELET_GET_MODE(redux_task->cl, 0) & ~STARPU_COMMUTE) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
 					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet %p has to be R", redux_task->cl);
+					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0) & STARPU_COMMUTE))
+					{
+						static int warned;
+						if (!warned)
+						{
+							warned = 1;
+							_STARPU_DISP("Warning: for reductions, codelet %p should have STARPU_COMMUTE along STARPU_RW\n", redux_task->cl);
+						}
+					}
 
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i], 0);
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i+step], 1);

+ 11 - 8
src/datawizard/user_interactions.c

@@ -53,7 +53,7 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 
 	_starpu_spin_lock(&handle->header_lock);
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, NULL, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
 
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
@@ -126,7 +126,7 @@ static inline void _starpu_data_acquire_launch_fetch(struct user_interaction_wra
 	starpu_data_handle_t handle = wrapper->handle;
 	struct _starpu_data_replicate *replicate = node >= 0 ? &handle->per_node[node] : NULL;
 
-	int ret = _starpu_fetch_data_on_node(handle, node, replicate, wrapper->mode, wrapper->detached, wrapper->prefetch, async, callback, callback_arg, wrapper->prio, "_starpu_data_acquire_launch_fetch");
+	int ret = _starpu_fetch_data_on_node(handle, node, replicate, wrapper->mode, wrapper->detached, NULL, wrapper->prefetch, async, callback, callback_arg, wrapper->prio, "_starpu_data_acquire_launch_fetch");
 	STARPU_ASSERT(!ret);
 }
 
@@ -191,7 +191,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 							  void (*callback)(void *arg),
 							  void *arg,
 							  int sequential_consistency, int quick,
-							  long *pre_sync_jobid, long *post_sync_jobid)
+							  long *pre_sync_jobid, long *post_sync_jobid, int prio)
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data (%p) is not possible", handle);
@@ -211,6 +211,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 	wrapper->callback_arg = arg;
 	wrapper->pre_sync_task = NULL;
 	wrapper->post_sync_task = NULL;
+	wrapper->prio = prio;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int handle_sequential_consistency = handle->sequential_consistency;
@@ -225,6 +226,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 		wrapper->pre_sync_task->callback_func = starpu_data_acquire_cb_pre_sync_callback;
 		wrapper->pre_sync_task->callback_arg = wrapper;
 		wrapper->pre_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
+		wrapper->pre_sync_task->priority = prio;
 		pre_sync_job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
 		if (pre_sync_jobid)
 			*pre_sync_jobid = pre_sync_job->job_id;
@@ -233,6 +235,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 		wrapper->post_sync_task->name = "_starpu_data_acquire_cb_release";
 		wrapper->post_sync_task->detach = 1;
 		wrapper->post_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
+		wrapper->post_sync_task->priority = prio;
 		post_sync_job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
 		if (post_sync_jobid)
 			*post_sync_jobid = post_sync_job->job_id;
@@ -280,7 +283,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_quick(starpu_data_hand
 							  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg,
 							  int sequential_consistency, int quick)
 {
-	return starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(handle, node, mode, NULL, callback, arg, sequential_consistency, quick, NULL, NULL);
+	return starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(handle, node, mode, NULL, callback, arg, sequential_consistency, quick, NULL, NULL, STARPU_DEFAULT_PRIO);
 }
 
 int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node,
@@ -616,7 +619,7 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, 0);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, STARPU_DEFAULT_PRIO);
 }
 
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
@@ -626,7 +629,7 @@ int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node
 
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
-	return starpu_data_prefetch_on_node_prio(handle, node, async, 0);
+	return starpu_data_prefetch_on_node_prio(handle, node, async, STARPU_DEFAULT_PRIO);
 }
 
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
@@ -636,7 +639,7 @@ int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned
 
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
-	return starpu_data_idle_prefetch_on_node_prio(handle, node, async, 0);
+	return starpu_data_idle_prefetch_on_node_prio(handle, node, async, STARPU_DEFAULT_PRIO);
 }
 
 static void _starpu_data_wont_use(void *data)
@@ -817,7 +820,7 @@ void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int
 		unsigned node;
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
-			if (handle->per_node[memory_node].requested & (1UL << node))
+			if (handle->per_node[memory_node].request[node])
 			{
 				requested = 1;
 				break;

+ 2 - 2
src/datawizard/write_back.c

@@ -50,7 +50,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 				while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 				{
 					cpt++;
-					__starpu_datawizard_progress(1, 1);
+					__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 				}
 				if (cpt == STARPU_SPIN_MAXTRY)
 					_starpu_spin_lock(&handle->header_lock);
@@ -64,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
 				struct _starpu_data_request *r;
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
-									 STARPU_R, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
+									 STARPU_R, NULL, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
 
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */

+ 2 - 2
src/debug/latency.c

@@ -34,7 +34,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
-		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
 
@@ -44,7 +44,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
-		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
 	}

+ 51 - 42
src/debug/traces/starpu_fxt.c

@@ -251,11 +251,12 @@ static void task_dump(struct task_info *task, struct starpu_fxt_options *options
 		fprintf(tasks_file, "\n");
 		fprintf(tasks_file, "Modes:");
 		for (i = 0; i < task->ndata; i++)
-			fprintf(tasks_file, " %s%s%s%s%s",
+			fprintf(tasks_file, " %s%s%s%s%s%s",
 				(task->data[i].mode & STARPU_R)?"R":"",
 				(task->data[i].mode & STARPU_W)?"W":"",
 				(task->data[i].mode & STARPU_SCRATCH)?"S":"",
 				(task->data[i].mode & STARPU_REDUX)?"X":"",
+				(task->data[i].mode & STARPU_MPI_REDUX)?"X-mpi":"",
 				(task->data[i].mode & STARPU_COMMUTE)?"C":"");
 		fprintf(tasks_file, "\n");
 		fprintf(tasks_file, "Sizes:");
@@ -763,15 +764,20 @@ static void memnode_pop_state(double time, const char *prefix, unsigned int memn
 #endif
 }
 
-static void memnode_event(double time, const char *prefix, unsigned int memnodeid, const char *name, unsigned long handle, unsigned long info, unsigned long size, unsigned int dest, struct starpu_fxt_options *options)
+static void memnode_event(double time, const char *prefix, unsigned int memnodeid, const char *name, unsigned long handle, unsigned long value, unsigned long info, long size_prio, unsigned int dest, struct starpu_fxt_options *options)
 {
 	if (!options->memory_states)
 		return;
+	// If there is not a valid memory node, we cant associate it
+	if((int)memnodeid < 0)
+		return;
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
 	char p_handle[STARPU_POTI_STR_LEN];
+	char p_value[STARPU_POTI_STR_LEN];
 	memmanager_container_alias(container, STARPU_POTI_STR_LEN, prefix, memnodeid);
 	snprintf(p_handle, sizeof(p_handle), "%lx", handle);
+	snprintf(p_value, sizeof(p_value), "%lx", value);
 
 #ifdef HAVE_POTI_USER_NEWEVENT
 	char p_dest[STARPU_POTI_STR_LEN];
@@ -780,15 +786,15 @@ static void memnode_event(double time, const char *prefix, unsigned int memnodei
 
 	memmanager_container_alias(p_dest, STARPU_POTI_STR_LEN, prefix, dest);
 	snprintf(p_info, sizeof(p_info), "%lu", info);
-	snprintf(p_size, sizeof(p_size), "%lu", size);
+	snprintf(p_size, sizeof(p_size), "%ld", size_prio);
 
-	poti_user_NewEvent(_starpu_poti_MemoryEvent, time, container, name, "0", 4,
+	poti_user_NewEvent(_starpu_poti_MemoryEvent, time, container, name, p_value, 4,
 			   p_handle, p_info, p_size, p_dest);
 #else
 	poti_NewEvent(time, container, name, p_handle);
 #endif
 #else
-	fprintf(out_paje_file, "22    %.9f    %s %smm%u  0 %lx %lu %lu %smm%u\n", time, name, prefix, memnodeid, handle, info, size, prefix, dest);
+	fprintf(out_paje_file, "22    %.9f    %s %smm%u  %lx %lx %lu %ld %smm%u\n", time, name, prefix, memnodeid, value, handle, info, size_prio, prefix, dest);
 #endif
 }
 
@@ -2232,7 +2238,7 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 		{
 			double time = get_event_time_stamp(ev, options);
 			memnode_push_state(time, prefix, dst, "Co");
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCo", handle, comid, size, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCo", handle, 0, comid, size, src, options);
 #ifdef STARPU_HAVE_POTI
 			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN], src_memnode_container[STARPU_POTI_STR_LEN];
 			char program_container[STARPU_POTI_STR_LEN];
@@ -2351,7 +2357,7 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 		{
 			double time = get_event_time_stamp(ev, options);
 			memnode_pop_state(time, prefix, dst);
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoE", handle, comid, size, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoE", handle, 0, comid, size, src, options);
 #ifdef STARPU_HAVE_POTI
 			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
 			char dst_memnode_container[STARPU_POTI_STR_LEN], program_container[STARPU_POTI_STR_LEN];
@@ -2378,7 +2384,7 @@ static void handle_start_driver_copy_async(struct fxt_ev_64 *ev, struct starpu_f
 		if (out_paje_file)
 		{
 			memnode_push_state(get_event_time_stamp(ev, options), prefix, dst, "CoA");
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoA", 0, 0, 0, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoA", 0, 0, 0, 0, src, options);
 		}
 
 }
@@ -2394,7 +2400,7 @@ static void handle_end_driver_copy_async(struct fxt_ev_64 *ev, struct starpu_fxt
 		if (out_paje_file)
 		{
 			memnode_pop_state(get_event_time_stamp(ev, options), prefix, dst);
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoAE", 0, 0, 0, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoAE", 0, 0, 0, 0, src, options);
 		}
 }
 
@@ -2408,32 +2414,36 @@ static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
 }
 
+static void handle_data_request(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
+	unsigned memnode = ev->param[0];
+	unsigned dest = ev->param[1];
+	unsigned prio = ev->param[2];
+	unsigned long handle = ev->param[3];
+	unsigned prefe = ev->param[4];
+	unsigned long request = ev->param[5];
+
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, request, prefe, prio, dest, options);
+}
+
 static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 {
 	unsigned memnode = ev->param[0];
 	unsigned size = ev->param[2];
 	unsigned long handle = ev->param[3];
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, size, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, size, memnode, options);
 }
 
 static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 {
 	unsigned memnode = ev->param[0];
-	unsigned dest = ev->param[1];
-	if(strcmp(eventstr, "rc")==0)
-	{
-		//If it is a Request Create, use dest normally
-	}
-	else
-	{
-		dest = memnode;
-	}
+	//unsigned dest = ev->param[1]; // Not used
 	unsigned size = ev->param[2];
 	unsigned long handle = ev->param[3];
 	unsigned prefe = ev->param[4];
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, prefe, size, memnode, options);
 }
 
 static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -2442,7 +2452,7 @@ static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_o
 	unsigned long handle = ev->param[2];
 	unsigned info = ev->param[3];
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, info, 0, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, info, 0, memnode, options);
 }
 
 static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -2450,7 +2460,7 @@ static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt
 	unsigned memnode = ev->param[0];
 	unsigned long handle = ev->param[2];
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, 0, memnode, options);
 }
 
 static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -2458,7 +2468,7 @@ static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_o
 	unsigned memnode = ev->param[0];
 	unsigned long handle = ev->param[2];
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, 0, memnode, options);
 }
 
 static void handle_push_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -3702,13 +3712,12 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				if (options->memory_states)
 					handle_data_state(&ev, options, "SS");
 				break;
-                       case _STARPU_FUT_DATA_REQUEST_CREATED:
-                               if (!options->no_bus && options->memory_states)
-                               {
-                                       handle_memnode_event_start_4(&ev, options, "rc");
-                               }
-                               break;
-
+			case _STARPU_FUT_DATA_REQUEST_CREATED:
+				if (!options->no_bus && options->memory_states)
+				{
+					handle_data_request(&ev, options, "rc");
+				}
+				break;
 		  case _STARPU_FUT_PAPI_TASK_EVENT_VALUE:
 				handle_papi_event(&ev, options);
 				break;
@@ -4207,18 +4216,6 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		free(options->worker_archtypes[i].devices);
 
-	struct _starpu_symbol_name *itor, *next;
-	for (itor = _starpu_symbol_name_list_begin(&symbol_list);
-		itor != _starpu_symbol_name_list_end(&symbol_list);
-		itor = next)
-	{
-		next = _starpu_symbol_name_list_next(itor);
-
-		_starpu_symbol_name_list_erase(&symbol_list, itor);
-		free(itor->name);
-		_starpu_symbol_name_delete(itor);
-	}
-
 	_starpu_fxt_component_deinit();
 
 	free_worker_ids();
@@ -4608,6 +4605,17 @@ void _starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
 static
 void _starpu_fxt_paje_file_close(void)
 {
+	struct _starpu_symbol_name *itor, *next;
+	for (itor = _starpu_symbol_name_list_begin(&symbol_list);
+		itor != _starpu_symbol_name_list_end(&symbol_list);
+		itor = next)
+	{
+		next = _starpu_symbol_name_list_next(itor);
+
+		_starpu_symbol_name_list_erase(&symbol_list, itor);
+		free(itor->name);
+		_starpu_symbol_name_delete(itor);
+	}
 	if (out_paje_file)
 		fclose(out_paje_file);
 }
@@ -4658,6 +4666,7 @@ uint64_t _starpu_fxt_find_start_time(char *filename_in)
 
 void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 {
+	starpu_drivers_preinit();
 	_starpu_fxt_options_set_dir(options);
 	_starpu_fxt_dag_init(options->dag_path);
 	_starpu_fxt_distrib_file_init(options);

+ 3 - 2
src/drivers/cpu/driver_cpu.c

@@ -40,6 +40,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
+#include <datawizard/datawizard.h>
 #include <core/simgrid.h>
 #include <core/task.h>
 #include <core/disk.h>
@@ -341,7 +342,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 		return ret;
 	}
 
-	res = __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 	if (!pending_task)
 		task = _starpu_get_worker_task(cpu_worker, workerid, memnode);
@@ -429,7 +430,7 @@ int _starpu_cpu_driver_deinit(struct _starpu_worker *cpu_worker)
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
 	unsigned memnode = cpu_worker->memory_node;
-	_starpu_handle_all_pending_node_data_requests(memnode);
+	_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
 	/* In case there remains some memory that was automatically
 	 * allocated by StarPU, we release it now. Note that data

+ 4 - 36
src/drivers/cuda/driver_cuda.c

@@ -37,6 +37,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
+#include <datawizard/datawizard.h>
 #include <core/task.h>
 #include <common/knobs.h>
 
@@ -935,14 +936,13 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	if (!idle_tasks)
 	{
 		/* No task ready yet, no better thing to do than waiting */
-		__starpu_datawizard_progress(1, !idle_transfers);
+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
 		return 0;
 	}
 #endif
 
 	/* Something done, make some progress */
-	res = !idle_tasks || !idle_transfers;
-	res |= __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 	/* And pull tasks */
 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, worker0->memory_node);
@@ -950,9 +950,6 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 #ifdef STARPU_SIMGRID
 	if (!res)
 		starpu_pthread_wait_wait(&worker0->wait);
-#else
-	if (!res)
-		return 0;
 #endif
 
 	for (i = 0; i < (int) worker_set->nworkers; i++)
@@ -972,35 +969,6 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		{
 			/* this is neither a cuda or a cublas task */
 			_starpu_worker_refuse_task(worker, task);
-#if 0
-			if (worker->pipeline_length)
-			{
-				int j;
-				for (j = 0; j < worker->ntasks; j++)
-				{
-					const int j_mod = (j+worker->first_task)%STARPU_MAX_PIPELINE;
-					if (task == worker->current_tasks[j_mod])
-					{
-						worker->current_tasks[j_mod] = NULL;
-						if (j == 0)
-						{
-							worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
-							_starpu_set_current_task(NULL);
-						}
-						break;
-					}
-				}
-				STARPU_ASSERT(j<worker->ntasks);
-			}
-			else
-			{
-				worker->current_task = NULL;
-				_starpu_set_current_task(NULL);
-			}
-			worker->ntasks--;
-			int res = _starpu_push_task_to_workers(task);
-			STARPU_ASSERT_MSG(res == 0, "_starpu_push_task_to_workers() unexpectedly returned = %d\n", res);
-#endif
 			continue;
 		}
 
@@ -1039,7 +1007,7 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
 		if (!usersleft)
                 {
 			/* I'm last, deinitialize device */
-			_starpu_handle_all_pending_node_data_requests(memnode);
+			_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
 			/* In case there remains some memory that was automatically
 			 * allocated by StarPU, we release it now. Note that data

+ 3 - 3
src/drivers/mp_common/source_common.c

@@ -978,7 +978,7 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
 		}
 	}
 
-        res |= __starpu_datawizard_progress(1, 1);
+        res |= __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
         /* Handle message which have been store */
         _starpu_src_common_handle_stored_async(mp_node);
@@ -1075,7 +1075,7 @@ void _starpu_src_common_workers_set(struct _starpu_worker_set * worker_set, int
         for (device = 0; device < ndevices; device++)
 	{
         	_STARPU_TRACE_END_PROGRESS(memnode[device]);
-                _starpu_handle_all_pending_node_data_requests(memnode[device]);
+                _starpu_datawizard_handle_all_pending_node_data_requests(memnode[device]);
 	}
 
         /* In case there remains some memory that was automatically
@@ -1107,7 +1107,7 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set, unsigned
 
         _STARPU_TRACE_END_PROGRESS(memnode);
 
-        _starpu_handle_all_pending_node_data_requests(memnode);
+        _starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
         /* In case there remains some memory that was automatically
          * allocated by StarPU, we release it now. Note that data

+ 4 - 4
src/drivers/opencl/driver_opencl.c

@@ -31,6 +31,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
+#include <datawizard/datawizard.h>
 #include <core/task.h>
 #include <common/knobs.h>
 
@@ -787,13 +788,12 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 	if (!idle_tasks)
 	{
 		/* No task ready yet, no better thing to do than waiting */
-		__starpu_datawizard_progress(1, !idle_transfers);
+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
 		return 0;
 	}
 #endif
 
-	res = !idle_tasks || !idle_transfers;
-	res |= __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 	task = _starpu_get_worker_task(worker, workerid, memnode);
 
@@ -840,7 +840,7 @@ int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
 
 	unsigned memnode = worker->memory_node;
 
-	_starpu_handle_all_pending_node_data_requests(memnode);
+	_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
 	/* In case there remains some memory that was automatically
 	 * allocated by StarPU, we release it now. Note that data

+ 3 - 0
src/profiling/profiling.c

@@ -114,6 +114,9 @@ int starpu_profiling_status_set(int status)
 	{
 		struct _starpu_worker *worker_struct = _starpu_get_worker_struct(worker);
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_struct->sched_mutex);
+	}
+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
+	{
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[worker]);
 	}
 

+ 1 - 1
src/sched_policies/component_best_implementation.c

@@ -112,7 +112,7 @@ static struct starpu_task * best_implementation_pull_task(struct starpu_sched_co
 	}
 	if(task)
 		/* this worker can execute this task as it was returned by a pop*/
-		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_worker_get_id_check());
+		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_bitmap_first(&component->workers_in_ctx));
 	return task;
 }
 

+ 3 - 1
src/sched_policies/component_fifo.c

@@ -180,8 +180,10 @@ static struct starpu_task * fifo_pull_task(struct starpu_sched_component * compo
 	struct starpu_task * task;
 	if (data->ready && to->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE)
 		task = _starpu_fifo_pop_first_ready_task(queue, starpu_bitmap_first(&to->workers_in_ctx), -1);
+	else if (to->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS)
+		task = _starpu_fifo_pop_task(queue, starpu_bitmap_first(&to->workers_in_ctx));
 	else
-		task = _starpu_fifo_pop_task(queue, starpu_worker_get_id_check());
+		task = _starpu_fifo_pop_task(queue, -1);
 	if(task && data->exp)
 	{
 		if(!isnan(task->predicted))

+ 3 - 3
src/sched_policies/component_worker.c

@@ -443,8 +443,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 		if(task)
 		{
 			_starpu_worker_task_list_transfer_started(list, task);
-			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			starpu_push_task_end(task);
+			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			goto ret;
 		}
 		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
@@ -470,8 +470,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 			STARPU_COMPONENT_MUTEX_LOCK(&list->mutex);
 			_starpu_worker_task_list_add(list, task);
 			_starpu_worker_task_list_transfer_started(list, task);
-			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			starpu_push_task_end(task);
+			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			goto ret;
 		}
 		struct starpu_sched_component * combined_worker_component = starpu_sched_component_worker_get(component->tree->sched_ctx_id, workerid);
@@ -486,8 +486,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 		STARPU_COMPONENT_MUTEX_LOCK(&list->mutex);
 		_starpu_worker_task_list_add(list, task);
 		_starpu_worker_task_list_transfer_started(list, task);
-		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 		starpu_push_task_end(task);
+		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 	}
 ret:
 	return task;

+ 25 - 2
src/sched_policies/fifo_queues.c

@@ -352,6 +352,29 @@ int _starpu_normalize_prio(int priority, int num_priorities, unsigned sched_ctx_
 	return ((num_priorities-1)/(max-min)) * (priority - min);
 }
 
+size_t _starpu_size_non_ready_buffers(struct starpu_task *task, unsigned worker)
+{
+	size_t cnt = 0;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	unsigned index;
+
+	for (index = 0; index < nbuffers; index++)
+	{
+		starpu_data_handle_t handle;
+		unsigned buffer_node = _starpu_task_data_get_node_on_worker(task, index, worker);
+
+		handle = STARPU_TASK_GET_HANDLE(task, index);
+
+		int is_valid;
+		starpu_data_query_status(handle, buffer_node, NULL, &is_valid, NULL);
+
+		if (!is_valid)
+			cnt+=starpu_data_get_size(handle);
+	}
+
+	return cnt;
+}
+
 int _starpu_count_non_ready_buffers(struct starpu_task *task, unsigned worker)
 {
 	int cnt = 0;
@@ -392,7 +415,7 @@ struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq
 
 		int first_task_priority = task->priority;
 
-		int non_ready_best = INT_MAX;
+		size_t non_ready_best = SIZE_MAX;
 
 		for (current = task; current; current = current->next)
 		{
@@ -400,7 +423,7 @@ struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq
 
 			if (priority >= first_task_priority)
 			{
-				int non_ready = _starpu_count_non_ready_buffers(current, workerid);
+				size_t non_ready = _starpu_size_non_ready_buffers(current, workerid);
 				if (non_ready < non_ready_best)
 				{
 					non_ready_best = non_ready;

+ 1 - 0
src/sched_policies/fifo_queues.h

@@ -69,6 +69,7 @@ struct starpu_task *_starpu_fifo_pop_local_task(struct _starpu_fifo_taskq *fifo)
 struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo, int workerid);
 int _starpu_normalize_prio(int priority, int num_priorities, unsigned sched_ctx_id);
 int _starpu_count_non_ready_buffers(struct starpu_task *task, unsigned worker);
+size_t _starpu_size_non_ready_buffers(struct starpu_task *task, unsigned worker);
 struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq *fifo_queue, unsigned workerid, int num_priorities);
 
 #endif // __FIFO_QUEUES_H__

+ 2 - 2
src/sched_policies/prio_deque.c

@@ -94,7 +94,7 @@ struct starpu_task *_starpu_prio_deque_deque_first_ready_task(struct _starpu_pri
 			return NULL;
 
 		int first_task_priority = task->priority;
-		int non_ready_best = INT_MAX;
+		size_t non_ready_best = SIZE_MAX;
 
 		for (current = starpu_task_prio_list_begin(&pdeque->list);
 		     current != starpu_task_prio_list_end(&pdeque->list);
@@ -104,7 +104,7 @@ struct starpu_task *_starpu_prio_deque_deque_first_ready_task(struct _starpu_pri
 
 			if (priority >= first_task_priority)
 			{
-				int non_ready = _starpu_count_non_ready_buffers(current, workerid);
+				size_t non_ready = _starpu_size_non_ready_buffers(current, workerid);
 				if (non_ready < non_ready_best)
 				{
 					non_ready_best = non_ready;

+ 5 - 0
src/sched_policies/work_stealing_policy.c

@@ -610,6 +610,11 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 	if (_starpu_worker_trylock(victim))
 	{
 		/* victim is busy, don't bother it, come back later */
+#ifdef STARPU_SIMGRID
+		starpu_sleep(0.000001);
+		/* Make sure we come back and not block */
+		starpu_wake_worker_no_relax(workerid);
+#endif
 		return NULL;
 	}
 	if (ws->per_worker[victim].running && ws->per_worker[victim].queue.ntasks > 0)

+ 0 - 0
src/util/execute_on_all.c


部分文件因文件數量過多而無法顯示