преди 5 години · 83bc792574
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -30,7 +30,7 @@ build:
 
																       when: never  # Prevent pipeline run for push event
															
 
																     - when: always # Run pipeline for all other cases
															
 
																-deploy:
															
 
																+check:
															
 
																   stage: deploy
															
 
																   script:
															
 
																     - ./contrib/gitlab/deploy.sh
															
@@ -38,3 +38,12 @@ deploy:
 
																     - if: '$CI_PIPELINE_SOURCE == "push"'
															
 
																       when: never  # Prevent pipeline run for push event
															
 
																     - when: always # Run pipeline for all other cases
															
 
																+
															
 
																+simgrid:
															
 
																+  stage: deploy
															
 
																+  script:
															
 
																+    - ./contrib/gitlab/simgrid.sh
															
 
																+  rules:
															
 
																+    - if: '$CI_PIPELINE_SOURCE == "push"'
															
 
																+      when: never  # Prevent pipeline run for push event
															
 
																+    - when: always # Run pipeline for all other cases
															
--- a/AUTHORS
+++ b/AUTHORS
@@ -17,6 +17,7 @@ Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
 
																 He Kun, Inria, <kun.he@inria.fr>
															
 
																 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
															
 
																 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>
															
 
																+Jego Antoine, Enseeiht, <antoine.jego@etu.enseeiht.fr>
															
 
																 Juhoor Mehdi, Université de Bordeaux, <mjuhoor@gmail.com>
															
 
																 Juven Alexis, Inria, <alexis.juven@inria.fr>
															
 
																 Keryell-Even Maël, Inria, <mael.keryell@inria.fr>
															
--- a/ChangeLog
+++ b/ChangeLog
@@ -51,9 +51,11 @@ New features:
 
																     starpu_mpi_interface_datatype_node_register which will be needed for
															
 
																     MPI/NUMA/GPUDirect.
															
 
																   * Add peek_data interface method.
															
 
																+  * Add STARPU_MPI_REDUX
															
 
																 Small changes:
															
 
																   * Add a synthetic energy efficiency testcase.
															
 
																+  * Make reduction methods want the commute flag.
															
 
																 StarPU 1.3.8
															
 
																 ====================================================================
															
@@ -67,6 +69,7 @@ Small features:
 
																     STARPU_MPI_THREAD_COREID environment variables to bind threads to cores
															
 
																     instead of hyperthreads.
															
 
																   * New STARPU_TASK_PROGRESS environment variable to show task progression.
															
 
																+  * Add STARPU_SIMGRID environment variable guard against native builds.
															
 
																 StarPU 1.3.7
															
 
																 ====================================================================
															
--- a/Makefile.am
+++ b/Makefile.am
@@ -53,9 +53,11 @@ if STARPU_BUILD_STARPURM
 
																 SUBDIRS += starpurm
															
 
																 endif
															
 
																+if STARPU_USE_CPU
															
 
																 if STARPU_BUILD_STARPUPY
															
 
																 SUBDIRS += starpupy
															
 
																 endif
															
 
																+endif
															
 
																 if STARPU_BUILD_SC_HYPERVISOR
															
 
																 SUBDIRS += sc_hypervisor
															
--- a/configure.ac
+++ b/configure.ac
@@ -167,9 +167,8 @@ if test x$enable_simgrid = xyes ; then
 
																 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
															
 
																 	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
															
 
																 	fi
															
 
																-	if test -n "$SIMGRID_LIBS" ; then
															
 
																-		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
															
 
																-	fi
															
 
																+	SAVED_LIBS="${LIBS}"
															
 
																+	LIBS="$SIMGRID_LIBS $LIBS"
															
 
																 	AC_HAVE_LIBRARY([simgrid], [],
															
 
																 		[
															
 
																 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
															
@@ -207,6 +206,7 @@ if test x$enable_simgrid = xyes ; then
 
																 	# Oldies for compatibility with older simgrid
															
 
																 	AC_CHECK_FUNCS([MSG_get_as_by_name MSG_zone_get_by_name MSG_environment_get_routing_root MSG_host_get_speed])
															
 
																+	LIBS="${SAVED_LIBS}"
															
 
																 	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
															
 
																 	# We won't bind or detect anything
															
@@ -225,6 +225,7 @@ if test x$enable_simgrid = xyes ; then
 
																 		SIMGRID_LIBS="$SIMGRID_LIBS -lstdc++"
															
 
																 		LIBS="$LIBS -lstdc++"
															
 
																 	fi
															
 
																+	SIMGRID_LDFLAGS="$SIMGRID_LIBS -lsimgrid"
															
 
																 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
															
 
																 	case \ $CXXFLAGS\  in
															
@@ -267,13 +268,13 @@ if test x$enable_simgrid = xyes ; then
 
																 		AC_PATH_PROG([SIMGRID_MC], [simgrid-mc], [no], [$simgrid_dir/bin:$PATH])
															
 
																 		LDFLAGS="$LDFLAGS -Wl,-znorelro -Wl,-znoseparate-code"
															
 
																 		# libsimgrid needs to be linked from binaries themselves for MC to work
															
 
																-		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lsimgrid"
															
 
																+		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $SIMGRID_LDFLAGS"
															
 
																 	fi
															
 
																 fi
															
 
																 AM_CONDITIONAL(STARPU_SIMGRID_MC, test x$enable_simgrid_mc = xyes)
															
 
																 AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
															
 
																 AC_SUBST(SIMGRID_CFLAGS)
															
 
																-AC_SUBST(SIMGRID_LIBS)
															
 
																+AC_SUBST(SIMGRID_LDFLAGS)
															
 
																 AC_MSG_CHECKING(whether SimGrid is enabled)
															
 
																 AC_MSG_RESULT($enable_simgrid)
															
@@ -2304,9 +2305,6 @@ if test x$maxnodes = x0 ; then
 
																 	if test x$enable_simgrid = xyes ; then
															
 
																 		# We need the room for the virtual CUDA/OpenCL devices
															
 
																 		nodes=`expr 4 + $nmaxcudadev + $nmaxopencldev + $nmaxmicdev + 1 + $nmaxmpidev`
															
 
																-		if test $nodes -gt 32 ; then
															
 
																-			nodes=32
															
 
																-		fi
															
 
																 	else
															
 
																 		# We have one memory node shared by all CPU workers, one node per GPU
															
 
																 		# and per MIC device
															
@@ -2342,8 +2340,7 @@ if test x$maxnodes = x0 ; then
 
																 	done
															
 
																 fi
															
 
																 if test $maxnodes -gt 32 ; then
															
 
																-	# FIXME: at least use uint64 so we can have 64 memory nodes
															
 
																-	AC_MSG_ERROR([selected number of nodes ($maxnodes) can not be greater than 32])
															
 
																+	AC_MSG_WARN([Note: the wt_mask feature only supports 32 memory nodes])
															
 
																 fi
															
 
																 AC_MSG_CHECKING(maximum number of memory nodes)
															
@@ -3448,6 +3445,14 @@ then
 
																 		AC_MSG_ERROR([python3 missing, cannot build StarPU python interface])
															
 
																 	fi
															
 
																 	AC_SUBST(PYTHON)
															
 
																+	PYTHON_INCLUDE_DIRS="`$PYTHON -c "from sysconfig import get_paths as gp; print(gp()@<:@'include'@:>@)"`"
															
 
																+	SAVED_CPPFLAGS="${CPPFLAGS}"
															
 
																+	CPPFLAGS="$CPPFLAGS -I$PYTHON_INCLUDE_DIRS"
															
 
																+	AC_CHECK_HEADERS([Python.h],[have_python_h=yes],[have_python_h=no])
															
 
																+	if test "$have_python_h" = "no" ; then
															
 
																+		AC_MSG_ERROR([Python.h missing, cannot build StarPU python interface (consider installing python-dev)])
															
 
																+	fi
															
 
																+	CPPFLAGS=${SAVED_CPPFLAGS}
															
 
																 	AC_MSG_CHECKING(for python3 module joblib)
															
 
																 	AC_PYTHON_MODULE(joblib,[joblib_avail=yes],[joblib_avail=no])
															
 
																 	AC_MSG_RESULT($joblib_avail)
															
@@ -3565,7 +3570,7 @@ STARPU_H_CPPFLAGS="$HWLOC_CFLAGS $STARPU_CUDA_CPPFLAGS $STARPU_OPENCL_CPPFLAGS $
 
																 AC_SUBST([STARPU_H_CPPFLAGS])
															
 
																 # these are the flags needed for linking libstarpu (and thus also for static linking)
															
 
																-LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LIBS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
															
 
																+LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LDFLAGS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
															
 
																 AC_SUBST([LIBSTARPU_LDFLAGS])
															
 
																 # these are the flags needed for linking against libstarpu (because starpu.h makes its includer use pthread_*, simgrid, etc.)
															
@@ -3805,11 +3810,11 @@ AC_MSG_NOTICE([
 
																 	       OpenMP runtime support enabled:                $enable_openmp
															
 
																 	       Cluster support enabled:                       $enable_cluster
															
 
																 	       SOCL enabled:                                  $build_socl
															
 
																-               SOCL test suite:                               $run_socl_check
															
 
																-               Scheduler Hypervisor:                          $build_sc_hypervisor
															
 
																-               simgrid enabled:                               $enable_simgrid
															
 
																-               ayudame enabled:                               $ayu_msg
															
 
																-               HDF5 enabled:                                  $enable_hdf5
															
 
																+	       SOCL test suite:                               $run_socl_check
															
 
																+	       Scheduler Hypervisor:                          $build_sc_hypervisor
															
 
																+	       simgrid enabled:                               $enable_simgrid
															
 
																+	       ayudame enabled:                               $ayu_msg
															
 
																+	       HDF5 enabled:                                  $enable_hdf5
															
 
																 	       Native fortran support:                        $enable_build_fortran
															
 
																 	       Native MPI fortran support:                    $use_mpi_fort
															
 
																 	       Support for multiple linear regression models: $support_mlr
															
--- a/contrib/ci.inria.fr/job-1-check.sh
+++ b/contrib/ci.inria.fr/job-1-check.sh
@@ -37,7 +37,11 @@ basename=$(basename $tarball .tar.gz)
 
																 export STARPU_HOME=$PWD/$basename/home
															
 
																 mkdir -p $basename
															
 
																 cd $basename
															
 
																-env > $PWD/env
															
 
																+(
															
 
																+    echo "oldPWD=\${PWD}"
															
 
																+    env|grep -v LS_COLORS | grep '^[A-Z]'|grep -v BASH_FUNC | grep '=' | sed 's/=/=\"/'| sed 's/$/\"/' | sed 's/^/export /'
															
 
																+    echo "cd \$oldPWD"
															
 
																+) > ${PWD}/env
															
 
																 test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
															
 
																 tar xfz ../$tarball
															
@@ -63,7 +67,17 @@ fi
 
																 export CC=gcc
															
 
																-CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc"
															
 
																+set +e
															
 
																+mpiexec -oversubscribe pwd 2>/dev/null
															
 
																+ret=$?
															
 
																+set -e
															
 
																+ARGS=""
															
 
																+if test "$ret" = "0"
															
 
																+then
															
 
																+    ARGS="--with-mpiexec-args=-oversubscribe"
															
 
																+fi
															
 
																+
															
 
																+CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc $ARGS"
															
 
																 CONFIGURE_CHECK=""
															
 
																 day=$(date +%u)
															
 
																 if test $day -le 5
															
@@ -72,10 +86,11 @@ then
 
																 #else
															
 
																     # we do a normal check, a long check takes too long on VM nodes
															
 
																 fi
															
 
																-../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS
															
 
																+../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS $STARPU_USER_CONFIGURE_OPTIONS
															
 
																 export STARPU_TIMEOUT_ENV=1800
															
 
																 export MPIEXEC_TIMEOUT=1800
															
 
																+
															
 
																 make
															
 
																 #make check
															
 
																 (make -k check || true) 2>&1 | tee  ../check_$$
															
--- a/contrib/gitlab/simgrid.sh
+++ b/contrib/gitlab/simgrid.sh
@@ -0,0 +1,22 @@
 
																+#!/bin/sh
															
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+#
															
 
																+
															
 
																+STARPU_USER_CONFIGURE_OPTIONS="--enable-simgrid --disable-mpi --disable-mpi-check" ./contrib/ci.inria.fr/job-1-check.sh
															
 
																+
															
 
																+
															
 
																+
															
 
																+
															
--- a/doc/doxygen/chapters/101_building.doxy
+++ b/doc/doxygen/chapters/101_building.doxy
@@ -520,7 +520,7 @@ It can also be convenient to try simulated benchmarks, if you want to give a try
 
																 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
															
 
																 using the SimGrid version of StarPU: first install the SimGrid simulator from
															
 
																 http://simgrid.gforge.inria.fr/ (we tested with SimGrid from 3.11 to 3.16, and
															
 
																-3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with -Denable_msg=ON.
															
 
																+3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with \c -Denable_msg=ON.
															
 
																 Other versions may have compatibility issues, 3.17 notably does
															
 
																 not build at all. MPI simulation does not work with version 3.22).
															
 
																 Then configure StarPU with \ref enable-simgrid
															
--- a/doc/doxygen/chapters/310_data_management.doxy
+++ b/doc/doxygen/chapters/310_data_management.doxy
@@ -643,7 +643,8 @@ struct starpu_codelet accumulate_variable_cl =
 
																         .cpu_funcs = { accumulate_variable_cpu },
															
 
																         .cpu_funcs_name = { "accumulate_variable_cpu" },
															
 
																         .cuda_funcs = { accumulate_variable_cuda },
															
 
																-        .nbuffers = 1,
															
 
																+        .nbuffers = 2,
															
 
																+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
															
 
																 }
															
 
																 \endcode
															
--- a/doc/doxygen/chapters/320_scheduling.doxy
+++ b/doc/doxygen/chapters/320_scheduling.doxy
--- a/doc/doxygen/chapters/380_offline_performance_tools.doxy
+++ b/doc/doxygen/chapters/380_offline_performance_tools.doxy
@@ -515,12 +515,12 @@ The <c>-f</c> option can also be used to display the performance in terms of GFl
 
																 \verbatim
															
 
																 $ tools/starpu_perfmodel_plot -f -e -s non_linear_memset_regression_based_energy
															
 
																-$ gnuplot starpu_non_linear_memset_regression_based_energy.gp
															
 
																-$ gv starpu_non_linear_memset_regression_based_energy.eps
															
 
																+$ gnuplot starpu_gflops_non_linear_memset_regression_based_energy.gp
															
 
																+$ gv starpu_gflops_non_linear_memset_regression_based_energy.eps
															
 
																 \endverbatim
															
 
																-\image html starpu_non_linear_memset_regression_based_energy_flops.png
															
 
																-\image latex starpu_non_linear_memset_regression_based_energy_flops.eps "" width=\textwidth
															
 
																+\image html starpu_gflops_non_linear_memset_regression_based_energy.png
															
 
																+\image latex starpu_gflops_non_linear_memset_regression_based_energy.eps "" width=\textwidth
															
 
																 We clearly see here that it is much more energy-efficient to stay in the L3 cache.
															
--- a/doc/doxygen/chapters/410_mpi_support.doxy
+++ b/doc/doxygen/chapters/410_mpi_support.doxy
@@ -744,6 +744,37 @@ starpu_mpi_data_set_rank(data, STARPU_MPI_PER_NODE);
 
																 The data can then be used just like pernode above.
															
 
																+\section MPIMpiRedux Inter-node reduction
															
 
																+
															
 
																+One might want to leverage a reduction pattern across several nodes.
															
 
																+Using \c STARPU_REDUX, one can obtain reduction patterns across several nodes,
															
 
																+however each core across the contributing nodes will spawn its own
															
 
																+contribution to work with. In the case that these allocations or the
															
 
																+required reductions are too expensive to execute for each contribution,
															
 
																+the access mode \c STARPU_MPI_REDUX tells StarPU to spawn only one contribution 
															
 
																+per node executing tasks partaking in the reduction.
															
 
																+
															
 
																+Tasks producing a result in the inter-node reduction should be registered as
															
 
																+accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode.
															
 
																+
															
 
																+\code{.c}
															
 
																+static struct starpu_codelet contrib_cl =
															
 
																+{
															
 
																+	.cpu_funcs = {cpu_contrib}, /* cpu implementation(s) of the routine */
															
 
																+	.nbuffers = 1, /* number of data handles referenced by this routine */
															
 
																+	.modes = {STARPU_RW | STARPU_COMMUTE}, /* access modes for the contribution */
															
 
																+	.name = "contribution"
															
 
																+};
															
 
																+\endcode
															
 
																+
															
 
																+When inserting these tasks, the access mode handed out to the StarPU-MPI layer
															
 
																+should be \c STARPU_MPI_REDUX. Assuming \c data is owned by node 0 and we want node
															
 
																+1 to compute the contribution, we could do the following.
															
 
																+
															
 
																+\code{.c}
															
 
																+starpu_mpi_task_insert(MPI_COMM_WORLD, &contrib_cl, STARPU_MPI_REDUX, data, STARPU_EXECUTE_ON_NODE, 1, 0); /* Node 1 computes it */
															
 
																+\endcode
															
 
																+
															
 
																 \section MPIPriorities Priorities
															
 
																 All send functions have a <c>_prio</c> variant which takes an additional
															
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -473,6 +473,16 @@ todo
 
																 todo
															
 
																 </dd>
															
 
																+<dt>STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES</dt>
															
 
																+<dd>
															
 
																+\anchor STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
															
 
																+\addindex __env__STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
															
 
																+Specify if CUDA workers should do only fast allocations
															
 
																+when running the datawizard progress of
															
 
																+other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
															
 
																+Default value is 0, allowing CUDA workers to do slow allocations.
															
 
																+</dd>
															
 
																+
															
 
																 </dl>
															
 
																 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
															
@@ -738,6 +748,27 @@ block when the memory allocation required for network reception overflows the
 
																 available main memory (as typically set by \ref STARPU_LIMIT_CPU_MEM)
															
 
																 </dd>
															
 
																+<dt>STARPU_MPI_EARLYDATA_ALLOCATE</dt>
															
 
																+<dd>
															
 
																+\anchor STARPU_MPI_EARLYDATA_ALLOCATE
															
 
																+\addindex __env__STARPU_MPI_EARLYDATA_ALLOCATE
															
 
																+When set to 1, the MPI Driver will immediately allocate the data for early
															
 
																+requests instead of issuing a data request and blocking. The default value is 0,
															
 
																+issuing a data request. Because it is an early request and we do not know its
															
 
																+real priority, the data request will assume \ref STARPU_DEFAULT_PRIO. In cases
															
 
																+where there are many data requests with priorities greater than
															
 
																+\ref STARPU_DEFAULT_PRIO the MPI driver could be blocked for long periods.
															
 
																+</dd>
															
 
																+
															
 
																+<dt>STARPU_SIMGRID</dt>
															
 
																+<dd>
															
 
																+\anchor STARPU_SIMGRID
															
 
																+\addindex __env__STARPU_SIMGRID
															
 
																+When set to 1 (the default is 0), this makes StarPU check that it was really
															
 
																+built with simulation support. This is convenient in scripts to avoid using a
															
 
																+native version, that would try to update performance models...
															
 
																+</dd>
															
 
																+
															
 
																 <dt>STARPU_SIMGRID_TRANSFER_COST</dt>
															
 
																 <dd>
															
 
																 \anchor STARPU_SIMGRID_TRANSFER_COST
															
--- a/doc/doxygen/chapters/code/disk_copy.c
+++ b/doc/doxygen/chapters/code/disk_copy.c
@@ -33,7 +33,7 @@
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																-	double * A,*B,*C,*D,*E,*F;
															
 
																+	double *A, *F;
															
 
																 	/* limit main ram to force to push in disk */
															
 
																 	setenv("STARPU_LIMIT_CPU_MEM", "160", 1);
															
--- a/doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.eps
+++ b/doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.eps
--- a/doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.pdf
+++ b/doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.pdf
--- a/doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.png
+++ b/doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.png
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -106,6 +106,7 @@ examplebin_PROGRAMS =
 
																 noinst_HEADERS = 				\
															
 
																 	axpy/axpy.h                             \
															
 
																 	cg/cg.h					\
															
 
																+	cg/cg_kernels.c				\
															
 
																 	heat/lu_kernels_model.h			\
															
 
																 	heat/dw_sparse_cg.h			\
															
 
																 	heat/heat.h				\
															
@@ -869,7 +870,6 @@ if !STARPU_NO_BLAS_LIB
 
																 cg_cg_SOURCES =					\
															
 
																 	cg/cg.c					\
															
 
																-	cg/cg_kernels.c				\
															
 
																 	common/blas.c
															
 
																 cg_cg_LDADD =					\
															
--- a/examples/basic_examples/multiformat_conversion_codelets.c
+++ b/examples/basic_examples/multiformat_conversion_codelets.c
@@ -41,6 +41,7 @@ struct starpu_codelet cpu_to_cuda_cl =
 
																 	.cuda_funcs = {cpu_to_cuda_cuda_func},
															
 
																 	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 	.nbuffers = 1,
															
 
																+	.modes = {STARPU_RW},
															
 
																 	.name = "codelet_cpu_to_cuda"
															
 
																 };
															
@@ -48,6 +49,7 @@ struct starpu_codelet cuda_to_cpu_cl =
 
																 {
															
 
																 	.cpu_funcs = {cuda_to_cpu},
															
 
																 	.nbuffers = 1,
															
 
																+	.modes = {STARPU_RW},
															
 
																 	.name = "codelet_cude_to_cpu"
															
 
																 };
															
 
																 #endif
															
@@ -73,12 +75,14 @@ struct starpu_codelet cpu_to_opencl_cl =
 
																 {
															
 
																 	.opencl_funcs = {cpu_to_opencl_opencl_func},
															
 
																 	.opencl_flags = {STARPU_OPENCL_ASYNC},
															
 
																-	.nbuffers = 1
															
 
																+	.nbuffers = 1,
															
 
																+	.modes = {STARPU_RW},
															
 
																 };
															
 
																 struct starpu_codelet opencl_to_cpu_cl =
															
 
																 {
															
 
																 	.cpu_funcs = {opencl_to_cpu},
															
 
																-	.nbuffers = 1
															
 
																+	.nbuffers = 1,
															
 
																+	.modes = {STARPU_RW},
															
 
																 };
															
 
																 #endif
															
--- a/examples/cg/cg.c
+++ b/examples/cg/cg.c
@@ -19,11 +19,6 @@
 
																 #include <starpu.h>
															
 
																 #include <common/blas.h>
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-#include <cuda.h>
															
 
																-#endif
															
 
																-
															
 
																-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
															
 
																 /*
															
 
																  *	Conjugate Gradient
															
@@ -68,32 +63,34 @@
 
																 #include "cg.h"
															
 
																-static int long long n = 4096;
															
 
																-static int nblocks = 8;
															
 
																-static int use_reduction = 1;
															
 
																+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks);
															
 
																-static starpu_data_handle_t A_handle, b_handle, x_handle;
															
 
																-static TYPE *A, *b, *x;
															
 
																+#define HANDLE_TYPE_VECTOR starpu_data_handle_t
															
 
																+#define HANDLE_TYPE_MATRIX starpu_data_handle_t
															
 
																+#define TASK_INSERT(cl, ...) starpu_task_insert(cl, ##__VA_ARGS__)
															
 
																+#define GET_VECTOR_BLOCK(v, i) starpu_data_get_sub_data(v, 1, i)
															
 
																+#define GET_MATRIX_BLOCK(m, i, j) starpu_data_get_sub_data(m, 2, i, j)
															
 
																+#define BARRIER()
															
 
																+#define GET_DATA_HANDLE(handle)
															
 
																+#define FPRINTF_SERVER FPRINTF
															
 
																+
															
 
																+#include "cg_kernels.c"
															
 
																-#ifdef STARPU_QUICK_CHECK
															
 
																-static int i_max = 5;
															
 
																-#elif !defined(STARPU_LONG_CHECK)
															
 
																-static int i_max = 100;
															
 
																-#else
															
 
																-static int i_max = 1000;
															
 
																-#endif
															
 
																-static double eps = (10e-14);
															
 
																-static starpu_data_handle_t r_handle, d_handle, q_handle;
															
 
																+
															
 
																+static TYPE *A, *b, *x;
															
 
																 static TYPE *r, *d, *q;
															
 
																-static starpu_data_handle_t dtq_handle, rtr_handle;
															
 
																-static TYPE dtq, rtr;
															
 
																-extern struct starpu_codelet accumulate_variable_cl;
															
 
																-extern struct starpu_codelet accumulate_vector_cl;
															
 
																-extern struct starpu_codelet bzero_variable_cl;
															
 
																-extern struct starpu_codelet bzero_vector_cl;
															
 
																+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
															
 
																+{
															
 
																+	unsigned b;
															
 
																+
															
 
																+	for (b = 0; b < nblocks; b++)
															
 
																+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																 /*
															
 
																  *	Generate Input data
															
@@ -264,162 +261,48 @@ static void display_matrix(void)
 
																 }
															
 
																 #endif
															
 
																-/*
															
 
																- *	Main loop
															
 
																- */
															
 
																-
															
 
																-static int cg(void)
															
 
																+static void display_x_result(void)
															
 
																 {
															
 
																-	double delta_new, delta_0;
															
 
																-
															
 
																-	int i = 0;
															
 
																-	int ret;
															
 
																+	int j, i;
															
 
																+	starpu_data_handle_t sub;
															
 
																-	/* r <- b */
															
 
																-	ret = copy_handle(r_handle, b_handle, nblocks);
															
 
																-	if (ret == -ENODEV) return ret;
															
 
																+	FPRINTF(stderr, "Computed X vector:\n");
															
 
																-	/* r <- r - A x */
															
 
																-	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
															
 
																-	if (ret == -ENODEV) return ret;
															
 
																+	int block_size = n / nblocks;
															
 
																-	/* d <- r */
															
 
																-	ret = copy_handle(d_handle, r_handle, nblocks);
															
 
																-	if (ret == -ENODEV) return ret;
															
 
																-
															
 
																-	/* delta_new = dot(r,r) */
															
 
																-	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
															
 
																-	if (ret == -ENODEV) return ret;
															
 
																-
															
 
																-	starpu_data_acquire(rtr_handle, STARPU_R);
															
 
																-	delta_new = rtr;
															
 
																-	delta_0 = delta_new;
															
 
																-	starpu_data_release(rtr_handle);
															
 
																-
															
 
																-	FPRINTF(stderr, "*************** INITIAL ************ \n");
															
 
																-	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
															
 
																-
															
 
																-	double start;
															
 
																-	double end;
															
 
																-	start = starpu_timing_now();
															
 
																-
															
 
																-	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
															
 
																+	for (j = 0; j < nblocks; j++)
															
 
																 	{
															
 
																-		double delta_old;
															
 
																-		double alpha, beta;
															
 
																-
															
 
																-		starpu_iteration_push(i);
															
 
																-
															
 
																-		/* q <- A d */
															
 
																-		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
															
 
																-
															
 
																-		/* dtq <- dot(d,q) */
															
 
																-		dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
															
 
																-
															
 
																-		/* alpha = delta_new / dtq */
															
 
																-		starpu_data_acquire(dtq_handle, STARPU_R);
															
 
																-		alpha = delta_new/dtq;
															
 
																-		starpu_data_release(dtq_handle);
															
 
																-
															
 
																-		/* x <- x + alpha d */
															
 
																-		axpy_kernel(x_handle, d_handle, alpha, nblocks);
															
 
																-
															
 
																-		if ((i % 50) == 0)
															
 
																-		{
															
 
																-			/* r <- b */
															
 
																-			copy_handle(r_handle, b_handle, nblocks);
															
 
																-
															
 
																-			/* r <- r - A x */
															
 
																-			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
															
 
																-		}
															
 
																-		else
															
 
																-		{
															
 
																-			/* r <- r - alpha q */
															
 
																-			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
															
 
																-		}
															
 
																-
															
 
																-		/* delta_new = dot(r,r) */
															
 
																-		dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
															
 
																-
															
 
																-		starpu_data_acquire(rtr_handle, STARPU_R);
															
 
																-		delta_old = delta_new;
															
 
																-		delta_new = rtr;
															
 
																-		beta = delta_new / delta_old;
															
 
																-		starpu_data_release(rtr_handle);
															
 
																-
															
 
																-		/* d <- beta d + r */
															
 
																-		scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
															
 
																-
															
 
																-		if ((i % 10) == 0)
															
 
																+		sub = starpu_data_get_sub_data(x_handle, 1, j);
															
 
																+		starpu_data_acquire(sub, STARPU_R);
															
 
																+		for (i = 0; i < block_size; i++)
															
 
																 		{
															
 
																-			/* We here take the error as ||r||_2 / (n||b||_2) */
															
 
																-			double error = sqrt(delta_new/delta_0)/(1.0*n);
															
 
																-			FPRINTF(stderr, "*****************************************\n");
															
 
																-			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
															
 
																+			FPRINTF(stderr, "% 02.2e\n", x[j*block_size + i]);
															
 
																 		}
															
 
																-
															
 
																-		starpu_iteration_pop();
															
 
																-		i++;
															
 
																+		starpu_data_release(sub);
															
 
																 	}
															
 
																-
															
 
																-	end = starpu_timing_now();
															
 
																-
															
 
																-	double timing = end - start;
															
 
																-	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
															
 
																-	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
															
 
																-	return 0;
															
 
																 }
															
 
																-static int check(void)
															
 
																-{
															
 
																-	return 0;
															
 
																-}
															
 
																 static void parse_args(int argc, char **argv)
															
 
																 {
															
 
																 	int i;
															
 
																 	for (i = 1; i < argc; i++)
															
 
																 	{
															
 
																-	        if (strcmp(argv[i], "-n") == 0)
															
 
																-		{
															
 
																-			n = (int long long)atoi(argv[++i]);
															
 
																-			continue;
															
 
																-		}
															
 
																-
															
 
																-	        if (strcmp(argv[i], "-maxiter") == 0)
															
 
																-		{
															
 
																-			i_max = atoi(argv[++i]);
															
 
																-			if (i_max <= 0)
															
 
																-			{
															
 
																-				FPRINTF(stderr, "the number of iterations must be positive, not %d\n", i_max);
															
 
																-				exit(EXIT_FAILURE);
															
 
																-			}
															
 
																-			continue;
															
 
																-		}
															
 
																-
															
 
																-	        if (strcmp(argv[i], "-nblocks") == 0)
															
 
																-		{
															
 
																-			nblocks = atoi(argv[++i]);
															
 
																-			continue;
															
 
																-		}
															
 
																-
															
 
																-	        if (strcmp(argv[i], "-no-reduction") == 0)
															
 
																-		{
															
 
																-			use_reduction = 0;
															
 
																-			continue;
															
 
																-		}
															
 
																-
															
 
																 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
															
 
																 		{
															
 
																-			FPRINTF(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
															
 
																+			FPRINTF_SERVER(stderr, "usage: %s [-h] [-nblocks #blocks] [-display-result] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
															
 
																 			exit(-1);
															
 
																 		}
															
 
																-        }
															
 
																+	}
															
 
																+
															
 
																+	parse_common_args(argc, argv);
															
 
																 }
															
 
																+
															
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																 	int ret;
															
 
																+	double start, end;
															
 
																 	/* Not supported yet */
															
 
																 	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
															
@@ -434,9 +317,19 @@ int main(int argc, char **argv)
 
																 	starpu_cublas_init();
															
 
																+	FPRINTF(stderr, "************** PARAMETERS ***************\n");
															
 
																+	FPRINTF(stderr, "Problem size (-n): %lld\n", n);
															
 
																+	FPRINTF(stderr, "Maximum number of iterations (-maxiter): %d\n", i_max);
															
 
																+	FPRINTF(stderr, "Number of blocks (-nblocks): %d\n", nblocks);
															
 
																+	FPRINTF(stderr, "Reduction (-no-reduction): %s\n", use_reduction ? "enabled" : "disabled");
															
 
																+
															
 
																+	start = starpu_timing_now();
															
 
																 	generate_random_problem();
															
 
																 	register_data();
															
 
																 	partition_data();
															
 
																+	end = starpu_timing_now();
															
 
																+
															
 
																+	FPRINTF(stderr, "Problem intialization timing : %2.2f seconds\n", (end-start)/10e6);
															
 
																 	ret = cg();
															
 
																 	if (ret == -ENODEV)
															
@@ -445,10 +338,13 @@ int main(int argc, char **argv)
 
																 		goto enodev;
															
 
																 	}
															
 
																-	ret = check();
															
 
																-
															
 
																 	starpu_task_wait_for_all();
															
 
																+	if (display_result)
															
 
																+	{
															
 
																+		display_x_result();
															
 
																+	}
															
 
																+
															
 
																 enodev:
															
 
																 	unregister_data();
															
 
																 	free_data();
															
--- a/examples/cg/cg.h
+++ b/examples/cg/cg.h
@@ -54,29 +54,4 @@
 
																 #define cublasscal	cublasSscal
															
 
																 #endif
															
 
																-int dot_kernel(starpu_data_handle_t v1,
															
 
																-	       starpu_data_handle_t v2,
															
 
																-	       starpu_data_handle_t s,
															
 
																-	       unsigned nblocks,
															
 
																-	       int use_reduction);
															
 
																-
															
 
																-int gemv_kernel(starpu_data_handle_t v1,
															
 
																-                starpu_data_handle_t matrix, 
															
 
																-                starpu_data_handle_t v2,
															
 
																-                TYPE p1, TYPE p2,
															
 
																-		unsigned nblocks,
															
 
																-		int use_reduction);
															
 
																-
															
 
																-int axpy_kernel(starpu_data_handle_t v1,
															
 
																-		starpu_data_handle_t v2, TYPE p1,
															
 
																-		unsigned nblocks);
															
 
																-
															
 
																-int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
															
 
																-		     starpu_data_handle_t v2, TYPE p2,
															
 
																-		     unsigned nblocks);
															
 
																-
															
 
																-int copy_handle(starpu_data_handle_t dst,
															
 
																-		starpu_data_handle_t src,
															
 
																-		unsigned nblocks);
															
 
																-
															
 
																 #endif /* __STARPU_EXAMPLE_CG_H__ */
															
--- a/examples/cg/cg_kernels.c
+++ b/examples/cg/cg_kernels.c
@@ -23,11 +23,43 @@
 
																 #include <limits.h>
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																+#include <cuda.h>
															
 
																 #include <starpu_cublas_v2.h>
															
 
																 static const TYPE gp1 = 1.0;
															
 
																 static const TYPE gm1 = -1.0;
															
 
																 #endif
															
 
																+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
															
 
																+
															
 
																+static int nblocks = 8;
															
 
																+
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																+static int i_max = 5;
															
 
																+static int long long n = 2048;
															
 
																+#elif !defined(STARPU_LONG_CHECK)
															
 
																+static int long long n = 4096;
															
 
																+static int i_max = 100;
															
 
																+#else
															
 
																+static int long long n = 4096;
															
 
																+static int i_max = 1000;
															
 
																+#endif
															
 
																+static double eps = (10e-14);
															
 
																+
															
 
																+int use_reduction = 1;
															
 
																+int display_result = 0;
															
 
																+
															
 
																+HANDLE_TYPE_MATRIX A_handle;
															
 
																+HANDLE_TYPE_VECTOR b_handle;
															
 
																+HANDLE_TYPE_VECTOR x_handle;
															
 
																+
															
 
																+HANDLE_TYPE_VECTOR r_handle;
															
 
																+HANDLE_TYPE_VECTOR d_handle;
															
 
																+HANDLE_TYPE_VECTOR q_handle;
															
 
																+
															
 
																+starpu_data_handle_t dtq_handle;
															
 
																+starpu_data_handle_t rtr_handle;
															
 
																+TYPE dtq, rtr;
															
 
																+
															
 
																 #if 0
															
 
																 static void print_vector_from_descr(unsigned nx, TYPE *v)
															
 
																 {
															
@@ -120,7 +152,7 @@ struct starpu_codelet accumulate_variable_cl =
 
																 	.cuda_funcs = {accumulate_variable_cuda},
															
 
																 	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 #endif
															
 
																-	.modes = {STARPU_RW, STARPU_R},
															
 
																+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
															
 
																 	.nbuffers = 2,
															
 
																 	.model = &accumulate_variable_model
															
 
																 };
															
@@ -164,7 +196,7 @@ struct starpu_codelet accumulate_vector_cl =
 
																 	.cuda_funcs = {accumulate_vector_cuda},
															
 
																 	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 #endif
															
 
																-	.modes = {STARPU_RW, STARPU_R},
															
 
																+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
															
 
																 	.nbuffers = 2,
															
 
																 	.model = &accumulate_vector_model
															
 
																 };
															
@@ -314,8 +346,8 @@ static struct starpu_codelet dot_kernel_cl =
 
																 	.model = &dot_kernel_model
															
 
																 };
															
 
																-int dot_kernel(starpu_data_handle_t v1,
															
 
																-	       starpu_data_handle_t v2,
															
 
																+int dot_kernel(HANDLE_TYPE_VECTOR v1,
															
 
																+	       HANDLE_TYPE_VECTOR v2,
															
 
																 	       starpu_data_handle_t s,
															
 
																 	       unsigned nblocks,
															
 
																 	       int use_reduction)
															
@@ -327,21 +359,21 @@ int dot_kernel(starpu_data_handle_t v1,
 
																 		starpu_data_invalidate_submit(s);
															
 
																 	else
															
 
																 	{
															
 
																-		ret = starpu_task_insert(&bzero_variable_cl, STARPU_W, s, 0);
															
 
																+		ret = TASK_INSERT(&bzero_variable_cl, STARPU_W, s, 0);
															
 
																 		if (ret == -ENODEV) return ret;
															
 
																-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
															
 
																 	}
															
 
																 	unsigned b;
															
 
																 	for (b = 0; b < nblocks; b++)
															
 
																 	{
															
 
																-		ret = starpu_task_insert(&dot_kernel_cl,
															
 
																+		ret = TASK_INSERT(&dot_kernel_cl,
															
 
																 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
															
 
																-					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
															
 
																-					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
															
 
																+					 STARPU_R, GET_VECTOR_BLOCK(v1, b),
															
 
																+					 STARPU_R, GET_VECTOR_BLOCK(v2, b),
															
 
																 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
															
 
																 					 0);
															
 
																-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
															
 
																 	}
															
 
																 	return 0;
															
 
																 }
															
@@ -477,9 +509,9 @@ static struct starpu_codelet gemv_kernel_cl =
 
																 	.model = &gemv_kernel_model
															
 
																 };
															
 
																-int gemv_kernel(starpu_data_handle_t v1,
															
 
																-		starpu_data_handle_t matrix,
															
 
																-		starpu_data_handle_t v2,
															
 
																+int gemv_kernel(HANDLE_TYPE_VECTOR v1,
															
 
																+		HANDLE_TYPE_MATRIX matrix,
															
 
																+		HANDLE_TYPE_VECTOR v2,
															
 
																 		TYPE p1, TYPE p2,
															
 
																 		unsigned nblocks,
															
 
																 		int use_reduction)
															
@@ -489,13 +521,13 @@ int gemv_kernel(starpu_data_handle_t v1,
 
																 	for (b2 = 0; b2 < nblocks; b2++)
															
 
																 	{
															
 
																-		ret = starpu_task_insert(&scal_kernel_cl,
															
 
																-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
															
 
																+		ret = TASK_INSERT(&scal_kernel_cl,
															
 
																+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b2),
															
 
																 					 STARPU_VALUE, &p1, sizeof(p1),
															
 
																 					 STARPU_TAG_ONLY, (starpu_tag_t) b2,
															
 
																 					 0);
															
 
																 		if (ret == -ENODEV) return ret;
															
 
																-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
															
 
																 	}
															
 
																 	for (b2 = 0; b2 < nblocks; b2++)
															
@@ -503,15 +535,15 @@ int gemv_kernel(starpu_data_handle_t v1,
 
																 		for (b1 = 0; b1 < nblocks; b1++)
															
 
																 		{
															
 
																 			TYPE one = 1.0;
															
 
																-			ret = starpu_task_insert(&gemv_kernel_cl,
															
 
																-						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
															
 
																-						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
															
 
																-						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
															
 
																+			ret = TASK_INSERT(&gemv_kernel_cl,
															
 
																+						 use_reduction?STARPU_REDUX:STARPU_RW,	GET_VECTOR_BLOCK(v1, b2),
															
 
																+						 STARPU_R,	GET_MATRIX_BLOCK(matrix, b2, b1),
															
 
																+						 STARPU_R,	GET_VECTOR_BLOCK(v2, b1),
															
 
																 						 STARPU_VALUE,	&one,	sizeof(one),
															
 
																 						 STARPU_VALUE,	&p2,	sizeof(p2),
															
 
																 						 STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nblocks + b1,
															
 
																 						 0);
															
 
																-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
															
 
																+			STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
															
 
																 		}
															
 
																 	}
															
 
																 	return 0;
															
@@ -582,23 +614,23 @@ static struct starpu_codelet scal_axpy_kernel_cl =
 
																 	.model = &scal_axpy_kernel_model
															
 
																 };
															
 
																-int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
															
 
																-		     starpu_data_handle_t v2, TYPE p2,
															
 
																+int scal_axpy_kernel(HANDLE_TYPE_VECTOR v1, TYPE p1,
															
 
																+		     HANDLE_TYPE_VECTOR v2, TYPE p2,
															
 
																 		     unsigned nblocks)
															
 
																 {
															
 
																 	unsigned b;
															
 
																 	for (b = 0; b < nblocks; b++)
															
 
																 	{
															
 
																 		int ret;
															
 
																-		ret = starpu_task_insert(&scal_axpy_kernel_cl,
															
 
																-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
															
 
																-					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
															
 
																+		ret = TASK_INSERT(&scal_axpy_kernel_cl,
															
 
																+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
															
 
																+					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
															
 
																 					 STARPU_VALUE, &p1, sizeof(p1),
															
 
																 					 STARPU_VALUE, &p2, sizeof(p2),
															
 
																 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
															
 
																 					 0);
															
 
																 		if (ret == -ENODEV) return ret;
															
 
																-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
															
 
																 	}
															
 
																 	return 0;
															
 
																 }
															
@@ -661,30 +693,177 @@ static struct starpu_codelet axpy_kernel_cl =
 
																 	.model = &axpy_kernel_model
															
 
																 };
															
 
																-int axpy_kernel(starpu_data_handle_t v1,
															
 
																-		starpu_data_handle_t v2, TYPE p1,
															
 
																+int axpy_kernel(HANDLE_TYPE_VECTOR v1,
															
 
																+		HANDLE_TYPE_VECTOR v2, TYPE p1,
															
 
																 		unsigned nblocks)
															
 
																 {
															
 
																 	unsigned b;
															
 
																 	for (b = 0; b < nblocks; b++)
															
 
																 	{
															
 
																 		int ret;
															
 
																-		ret = starpu_task_insert(&axpy_kernel_cl,
															
 
																-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
															
 
																-					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
															
 
																+		ret = TASK_INSERT(&axpy_kernel_cl,
															
 
																+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
															
 
																+					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
															
 
																 					 STARPU_VALUE, &p1, sizeof(p1),
															
 
																 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
															
 
																 					 0);
															
 
																 		if (ret == -ENODEV) return ret;
															
 
																-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
															
 
																+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
															
 
																 	}
															
 
																 	return 0;
															
 
																 }
															
 
																-int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
															
 
																+
															
 
																+/*
															
 
																+ *	Main loop
															
 
																+ */
															
 
																+int cg(void)
															
 
																 {
															
 
																-	unsigned b;
															
 
																-	for (b = 0; b < nblocks; b++)
															
 
																-		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
															
 
																+	TYPE delta_new, delta_0, error, delta_old, alpha, beta;
															
 
																+	double start, end, timing;
															
 
																+	int i = 0, ret;
															
 
																+
															
 
																+	/* r <- b */
															
 
																+	ret = copy_handle(r_handle, b_handle, nblocks);
															
 
																+	if (ret == -ENODEV) return ret;
															
 
																+
															
 
																+	/* r <- r - A x */
															
 
																+	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
															
 
																+	if (ret == -ENODEV) return ret;
															
 
																+
															
 
																+	/* d <- r */
															
 
																+	ret = copy_handle(d_handle, r_handle, nblocks);
															
 
																+	if (ret == -ENODEV) return ret;
															
 
																+
															
 
																+	/* delta_new = dot(r,r) */
															
 
																+	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
															
 
																+	if (ret == -ENODEV) return ret;
															
 
																+
															
 
																+	GET_DATA_HANDLE(rtr_handle);
															
 
																+	starpu_data_acquire(rtr_handle, STARPU_R);
															
 
																+	delta_new = rtr;
															
 
																+	delta_0 = delta_new;
															
 
																+	starpu_data_release(rtr_handle);
															
 
																+
															
 
																+	FPRINTF_SERVER(stderr, "Delta limit: %e\n", (double) (eps*eps*delta_0));
															
 
																+
															
 
																+	FPRINTF_SERVER(stderr, "**************** INITIAL ****************\n");
															
 
																+	FPRINTF_SERVER(stderr, "Delta 0: %e\n", delta_new);
															
 
																+
															
 
																+	BARRIER();
															
 
																+	start = starpu_timing_now();
															
 
																+
															
 
																+	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
															
 
																+	{
															
 
																+		starpu_iteration_push(i);
															
 
																+
															
 
																+		/* q <- A d */
															
 
																+		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
															
 
																+
															
 
																+		/* dtq <- dot(d,q) */
															
 
																+		dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
															
 
																+
															
 
																+		/* alpha = delta_new / dtq */
															
 
																+		GET_DATA_HANDLE(dtq_handle);
															
 
																+		starpu_data_acquire(dtq_handle, STARPU_R);
															
 
																+		alpha = delta_new / dtq;
															
 
																+		starpu_data_release(dtq_handle);
															
 
																+
															
 
																+		/* x <- x + alpha d */
															
 
																+		axpy_kernel(x_handle, d_handle, alpha, nblocks);
															
 
																+
															
 
																+		if ((i % 50) == 0)
															
 
																+		{
															
 
																+			/* r <- b */
															
 
																+			copy_handle(r_handle, b_handle, nblocks);
															
 
																+
															
 
																+			/* r <- r - A x */
															
 
																+			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
															
 
																+		}
															
 
																+		else
															
 
																+		{
															
 
																+			/* r <- r - alpha q */
															
 
																+			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
															
 
																+		}
															
 
																+
															
 
																+		/* delta_new = dot(r,r) */
															
 
																+		dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
															
 
																+
															
 
																+		GET_DATA_HANDLE(rtr_handle);
															
 
																+		starpu_data_acquire(rtr_handle, STARPU_R);
															
 
																+		delta_old = delta_new;
															
 
																+		delta_new = rtr;
															
 
																+		beta = delta_new / delta_old;
															
 
																+		starpu_data_release(rtr_handle);
															
 
																+
															
 
																+		/* d <- beta d + r */
															
 
																+		scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
															
 
																+
															
 
																+		if ((i % 10) == 0)
															
 
																+		{
															
 
																+			/* Here the error is taken to be ||r||_2 / (n||b||_2) */
															
 
																+			error = sqrt(delta_new/delta_0)/(1.0*n);
															
 
																+			FPRINTF_SERVER(stderr, "*****************************************\n");
															
 
																+			FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
															
 
																+		}
															
 
																+
															
 
																+		starpu_iteration_pop();
															
 
																+		i++;
															
 
																+	}
															
 
																+
															
 
																+	BARRIER();
															
 
																+	end = starpu_timing_now();
															
 
																+	timing = end - start;
															
 
																+
															
 
																+	error = sqrt(delta_new/delta_0)/(1.0*n);
															
 
																+	FPRINTF_SERVER(stderr, "*****************************************\n");
															
 
																+	FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
															
 
																+	FPRINTF_SERVER(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
															
 
																+	FPRINTF_SERVER(stderr, "Seconds per iteration : %2.2e seconds\n", timing/10e6/i);
															
 
																+	FPRINTF_SERVER(stderr, "Number of iterations per second : %2.2e it/s\n", i/(timing/10e6));
															
 
																+
															
 
																 	return 0;
															
 
																 }
															
 
																+
															
 
																+
															
 
																+void parse_common_args(int argc, char **argv)
															
 
																+{
															
 
																+	int i;
															
 
																+	for (i = 1; i < argc; i++)
															
 
																+	{
															
 
																+		if (strcmp(argv[i], "-n") == 0)
															
 
																+		{
															
 
																+			n = (int long long)atoi(argv[++i]);
															
 
																+			continue;
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-display-result") == 0)
															
 
																+		{
															
 
																+			display_result = 1;
															
 
																+			continue;
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-maxiter") == 0)
															
 
																+		{
															
 
																+			i_max = atoi(argv[++i]);
															
 
																+			if (i_max <= 0)
															
 
																+			{
															
 
																+				FPRINTF_SERVER(stderr, "the number of iterations must be positive, not %d\n", i_max);
															
 
																+				exit(EXIT_FAILURE);
															
 
																+			}
															
 
																+			continue;
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-nblocks") == 0)
															
 
																+		{
															
 
																+			nblocks = atoi(argv[++i]);
															
 
																+			continue;
															
 
																+		}
															
 
																+
															
 
																+		if (strcmp(argv[i], "-no-reduction") == 0)
															
 
																+		{
															
 
																+			use_reduction = 0;
															
 
																+			continue;
															
 
																+		}
															
 
																+	}
															
 
																+}
															
--- a/examples/pi/pi_redux.c
+++ b/examples/pi/pi_redux.c
@@ -322,7 +322,7 @@ static struct starpu_codelet redux_codelet =
 
																 	.cuda_funcs = {redux_cuda_func},
															
 
																 	.cuda_flags = {STARPU_CUDA_ASYNC},
															
 
																 #endif
															
 
																-	.modes = {STARPU_RW, STARPU_R},
															
 
																+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
															
 
																 	.nbuffers = 2
															
 
																 };
															
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -211,7 +211,7 @@ static struct starpu_codelet redux_codelet =
 
																 	.opencl_funcs = {redux_opencl_func},
															
 
																 	.opencl_flags = {STARPU_OPENCL_ASYNC},
															
 
																 #endif
															
 
																-	.modes = {STARPU_RW, STARPU_R},
															
 
																+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
															
 
																 	.nbuffers = 2,
															
 
																 	.name = "redux"
															
 
																 };
															
--- a/examples/reductions/minmax_reduction.c
+++ b/examples/reductions/minmax_reduction.c
@@ -95,7 +95,7 @@ static struct starpu_codelet minmax_redux_codelet =
 
																 {
															
 
																 	.cpu_funcs = {minmax_redux_cpu_func},
															
 
																 	.cpu_funcs_name = {"minmax_redux_cpu_func"},
															
 
																-	.modes = {STARPU_RW, STARPU_R},
															
 
																+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
															
 
																 	.nbuffers = 2,
															
 
																 	.name = "redux"
															
 
																 };
															
--- a/include/fstarpu_mod.f90
+++ b/include/fstarpu_mod.f90
@@ -25,6 +25,7 @@ module fstarpu_mod
 
																         type(c_ptr), bind(C) :: FSTARPU_RW
															
 
																         type(c_ptr), bind(C) :: FSTARPU_SCRATCH
															
 
																         type(c_ptr), bind(C) :: FSTARPU_REDUX
															
 
																+        type(c_ptr), bind(C) :: FSTARPU_MPI_REDUX
															
 
																         type(c_ptr), bind(C) :: FSTARPU_COMMUTE
															
 
																         type(c_ptr), bind(C) :: FSTARPU_SSEND
															
 
																         type(c_ptr), bind(C) :: FSTARPU_LOCALITY
															
@@ -36,11 +37,15 @@ module fstarpu_mod
 
																         type(c_ptr), bind(C) :: FSTARPU_TASK_DEPS_ARRAY
															
 
																         type(c_ptr), bind(C) :: FSTARPU_CALLBACK
															
 
																         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG
															
 
																+        type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG_NFREE
															
 
																         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG
															
 
																+        type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG_NFREE
															
 
																         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK
															
 
																         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG
															
 
																+        type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE
															
 
																         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP
															
 
																         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG
															
 
																+        type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE
															
 
																         type(c_ptr), bind(C) :: FSTARPU_PRIORITY
															
 
																         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_NODE
															
 
																         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_DATA
															
@@ -2395,6 +2400,7 @@ module fstarpu_mod
 
																                         FSTARPU_RW      = fstarpu_get_constant(C_CHAR_"FSTARPU_RW"//C_NULL_CHAR)
															
 
																                         FSTARPU_SCRATCH = fstarpu_get_constant(C_CHAR_"FSTARPU_SCRATCH"//C_NULL_CHAR)
															
 
																                         FSTARPU_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_REDUX"//C_NULL_CHAR)
															
 
																+                        FSTARPU_MPI_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_MPI_REDUX"//C_NULL_CHAR)
															
 
																                         FSTARPU_COMMUTE   = fstarpu_get_constant(C_CHAR_"FSTARPU_COMMUTE"//C_NULL_CHAR)
															
 
																                         FSTARPU_SSEND   = fstarpu_get_constant(C_CHAR_"FSTARPU_SSEND"//C_NULL_CHAR)
															
 
																                         FSTARPU_LOCALITY   = fstarpu_get_constant(C_CHAR_"FSTARPU_LOCALITY"//C_NULL_CHAR)
															
@@ -2406,12 +2412,19 @@ module fstarpu_mod
 
																                         FSTARPU_TASK_DEPS_ARRAY = fstarpu_get_constant(C_CHAR_"FSTARPU_TASK_DEPS_ARRAY"//C_NULL_CHAR)
															
 
																                         FSTARPU_CALLBACK        = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK"//C_NULL_CHAR)
															
 
																                         FSTARPU_CALLBACK_WITH_ARG       = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG"//C_NULL_CHAR)
															
 
																+                        FSTARPU_CALLBACK_WITH_ARG_NFREE       = &
															
 
																+                                fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG_NFREE"//C_NULL_CHAR)
															
 
																                         FSTARPU_CALLBACK_ARG    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG"//C_NULL_CHAR)
															
 
																+                        FSTARPU_CALLBACK_ARG_NFREE    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG_NFREE"//C_NULL_CHAR)
															
 
																                         FSTARPU_PROLOGUE_CALLBACK       = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK"//C_NULL_CHAR)
															
 
																                         FSTARPU_PROLOGUE_CALLBACK_ARG   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG"//C_NULL_CHAR)
															
 
																+                        FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE   = &
															
 
																+                                fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE"//C_NULL_CHAR)
															
 
																                         FSTARPU_PROLOGUE_CALLBACK_POP   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP"//C_NULL_CHAR)
															
 
																                         FSTARPU_PROLOGUE_CALLBACK_POP_ARG       = &
															
 
																                                 fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG"//C_NULL_CHAR)
															
 
																+                        FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE       = &
															
 
																+                                fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE"//C_NULL_CHAR)
															
 
																                         FSTARPU_PRIORITY        = fstarpu_get_constant(C_CHAR_"FSTARPU_PRIORITY"//C_NULL_CHAR)
															
 
																                         FSTARPU_EXECUTE_ON_NODE = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_NODE"//C_NULL_CHAR)
															
 
																                         FSTARPU_EXECUTE_ON_DATA = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_DATA"//C_NULL_CHAR)
															
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -471,6 +471,14 @@ struct starpu_conf
 
																 	   Maximum spinning backoff of drivers. Default value: \c 32
															
 
																 	 */
															
 
																 	unsigned driver_spinning_backoff_max;
															
 
																+
															
 
																+	/**
															
 
																+	   Specify if CUDA workers should do only fast allocations
															
 
																+	   when running the datawizard progress of
															
 
																+	   other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
															
 
																+	   Default value is 0, allowing CUDA workers to do slow allocations.
															
 
																+	 */
															
 
																+	int cuda_only_fast_alloc_other_memnodes;
															
 
																 };
															
 
																 /**
															
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -110,7 +110,15 @@ enum starpu_data_access_mode
 
																 				   src/sched_policies/work_stealing_policy.c
															
 
																 				   source code.
															
 
																 				*/
															
 
																-	STARPU_ACCESS_MODE_MAX=(1<<7) /**< todo */
															
 
																+	STARPU_MPI_REDUX=(1<<7), /**< Inter-node reduction only. Codelets 
															
 
																+				    contributing to these reductions should
															
 
																+				    be registered with STARPU_RW | STARPU_COMMUTE 
															
 
																+				    access modes.
															
 
																+			            When inserting these tasks through the
															
 
																+				    MPI layer however, the access mode needs
															
 
																+				    to be STARPU_MPI_REDUX. */
															
 
																+	STARPU_ACCESS_MODE_MAX=(1<<8) /**< The purpose of ACCESS_MODE_MAX is to
															
 
																+					be the maximum of this enum. */
															
 
																 };
															
 
																 struct starpu_data_interface_ops;
															
@@ -305,7 +313,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_quick(starpu_data_hand
 
																    This is a very internal interface, subject to changes, do not use this.
															
 
																 */
															
 
																-int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback_acquired)(void *arg, int *node, enum starpu_data_access_mode mode), void (*callback)(void *arg), void *arg, int sequential_consistency, int quick, long *pre_sync_jobid, long *post_sync_jobid);
															
 
																+int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback_acquired)(void *arg, int *node, enum starpu_data_access_mode mode), void (*callback)(void *arg), void *arg, int sequential_consistency, int quick, long *pre_sync_jobid, long *post_sync_jobid, int prio);
															
 
																 /**
															
 
																    The application can call this function instead of starpu_data_acquire() so as to
															
@@ -560,8 +568,10 @@ struct starpu_codelet;
 
																 /**
															
 
																    Set the codelets to be used for \p handle when it is accessed in the
															
 
																    mode ::STARPU_REDUX. Per-worker buffers will be initialized with
															
 
																-   the codelet \p init_cl, and reduction between per-worker buffers will be
															
 
																-   done with the codelet \p redux_cl.
															
 
																+   the codelet \p init_cl (which has to take one handle with STARPU_W), and
															
 
																+   reduction between per-worker buffers will be done with the codelet \p
															
 
																+   redux_cl (which has to take a first accumulation handle with
															
 
																+   STARPU_RW|STARPU_COMMUTE, and a second contribution handle with STARPU_R).
															
 
																 */
															
 
																 void starpu_data_set_reduction_methods(starpu_data_handle_t handle, struct starpu_codelet *redux_cl, struct starpu_codelet *init_cl);
															
--- a/include/starpu_hash.h
+++ b/include/starpu_hash.h
@@ -39,6 +39,14 @@ extern "C"
 
																 uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc);
															
 
																 /**
															
 
																+   Compute the CRC of a pointer value seeded by the \p inputcrc
															
 
																+   <em>current state</em>. The return value should be considered as the new
															
 
																+   <em>current state</em> for future CRC computation. This is used for computing
															
 
																+   data size footprint.
															
 
																+*/
															
 
																+uint32_t starpu_hash_crc32c_be_ptr(void *input, uint32_t inputcrc);
															
 
																+
															
 
																+/**
															
 
																    Compute the CRC of a 32bit number seeded by the \p inputcrc
															
 
																    <em>current state</em>. The return value should be considered as the new
															
 
																    <em>current state</em> for future CRC computation. This is used for computing
															
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -310,10 +310,10 @@ struct starpu_perfmodel
 
																 void starpu_perfmodel_init(struct starpu_perfmodel *model);
															
 
																 /**
															
 
																-   Deinitialize the \p model performance model structure. You need to call this 
															
 
																-   before deallocating the structure. You will probably want to call 
															
 
																+   Deinitialize the \p model performance model structure. You need to call this
															
 
																+   before deallocating the structure. You will probably want to call
															
 
																    starpu_perfmodel_unload_model() before calling this function, to save the perfmodel.
															
 
																-*/   
															
 
																+*/
															
 
																 int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
															
 
																 /**
															
@@ -322,7 +322,6 @@ int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
 
																    - \p workerid is the worker on which calibration is to be performed (in the case of GPUs, use -1 for CPUs)
															
 
																    - \p archi is the type of architecture on which calibration will be run
															
 
																 */
															
 
																-
															
 
																 int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
															
 
																 /**
															
@@ -335,7 +334,6 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
 
																    - \p workerid is the worker on which calibration was performed (in the case of GPUs, use -1 for CPUs)
															
 
																    - \p archi is the type of architecture on which calibration was run
															
 
																 */
															
 
																-
															
 
																 int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi);
															
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -861,7 +861,28 @@ struct starpu_task
 
																 	*/
															
 
																 	void *prologue_callback_arg;
															
 
																+	/** Optional field, the default value is <c>NULL</c>. This is a
															
 
																+	   function pointer of prototype <c>void (*f)(void*)</c>
															
 
																+	   which specifies a possible callback. If this pointer is
															
 
																+	   non-<c>NULL</c>, the callback function is executed on the host
															
 
																+	   when the task is pop-ed from the scheduler, just before getting
															
 
																+	   executed. The callback is passed the value contained in the
															
 
																+	   starpu_task::prologue_callback_pop_arg field.
															
 
																+	   No callback is executed if the field is set to <c>NULL</c>.
															
 
																+
															
 
																+	   With starpu_task_insert() and alike this can be specified thanks to
															
 
																+	   ::STARPU_PROLOGUE_CALLBACK_POP followed by the function pointer.
															
 
																+	*/
															
 
																 	void (*prologue_callback_pop_func)(void *);
															
 
																+	/**
															
 
																+	   Optional field, the default value is <c>NULL</c>. This is
															
 
																+	   the pointer passed to the prologue_callback_pop function. This
															
 
																+	   field is ignored if the field
															
 
																+	   starpu_task::prologue_callback_pop_func is set to <c>NULL</c>.
															
 
																+
															
 
																+	   With starpu_task_insert() and alike this can be specified thanks to
															
 
																+	   ::STARPU_PROLOGUE_CALLBACK_POP_ARG followed by the argument.
															
 
																+	   */
															
 
																 	void *prologue_callback_pop_arg;
															
 
																 	/**
															
@@ -1424,8 +1445,13 @@ struct starpu_task
 
																 	do {								\
															
 
																 		if ((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS || (task)->cl->nbuffers > STARPU_NMAXBUFS) \
															
 
																 			if ((task)->dyn_modes) (task)->dyn_modes[i] = mode; else (task)->modes[i] = mode; \
															
 
																-		else							\
															
 
																-			STARPU_CODELET_SET_MODE((task)->cl, mode, i);	\
															
 
																+		else \
															
 
																+		{							\
															
 
																+			enum starpu_data_access_mode cl_mode = STARPU_CODELET_GET_MODE((task)->cl, i); \
															
 
																+			STARPU_ASSERT_MSG(cl_mode == mode,	\
															
 
																+				"Task <%s> can't set its  %d-th buffer mode to %d as the codelet it derives from uses %d", \
															
 
																+				(task)->cl->name, i, mode, cl_mode);	\
															
 
																+		} \
															
 
																 	} while(0)
															
 
																 /**
															
--- a/include/starpu_util.h
+++ b/include/starpu_util.h
@@ -257,6 +257,10 @@ extern "C"
 
																 	_starpu_abort();				\
															
 
																 } while(0)
															
 
																+#if defined(_MSC_VER)
															
 
																+  #undef STARPU_HAVE_STRERROR_R
															
 
																+#endif
															
 
																+
															
 
																 #if defined(STARPU_HAVE_STRERROR_R)
															
 
																 #if (! defined(__GLIBC__) || !__GLIBC__) || ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && (! defined(_GNU_SOURCE)))
															
 
																 /* XSI-compliant version of strerror_r returns an int */
															
--- a/julia/README
+++ b/julia/README
@@ -20,8 +20,8 @@ $ make
 
																 Then, you need to add the lib/ directory to your library path and the julia/
															
 
																 directory to your Julia load path:
															
 
																-$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/lib
															
 
																-$ export JULIA_LOAD_PATH=$JULIA_LOAD_PATH:$PWD
															
 
																+$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/src/.libs
															
 
																+$ export JULIA_LOAD_PATH=$PWD/src:$JULIA_LOAD_PATH
															
 
																 This step can also be done by sourcing the setenv.sh script:
															
--- a/julia/examples/execute.sh.in
+++ b/julia/examples/execute.sh.in
@@ -1,7 +1,7 @@
 
																 #!@REALBASH@
															
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+# Copyright (C) 2020-2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -16,7 +16,7 @@
 
																 #
															
 
																 set -x
															
 
																-export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia:$JULIA_LOAD_PATH
															
 
																+export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia/src:$JULIA_LOAD_PATH
															
 
																 export STARPU_BUILD_DIR=@STARPU_BUILD_DIR@
															
 
																 export STARPU_SRC_DIR=@STARPU_SRC_DIR@
															
 
																 export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3
															
--- a/julia/setenv.sh
+++ b/julia/setenv.sh
@@ -1,6 +1,6 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+# Copyright (C) 2020-2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -13,7 +13,7 @@
 
																 #
															
 
																 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																 #
															
 
																-export JULIA_LOAD_PATH=$JULIA_LOAD_PATH:$PWD
															
 
																+export JULIA_LOAD_PATH=$PWD/src:$JULIA_LOAD_PATH
															
 
																 if [ `uname` == "Darwin" ]; then
															
 
																     export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$PWD/lib/
															
--- a/julia/src/StarPU.jl
+++ b/julia/src/StarPU.jl
@@ -65,7 +65,7 @@ export STARPU_HISTORY_BASED, STARPU_REGRESSION_BASED
 
																 export STARPU_NL_REGRESSION_BASED, STARPU_MULTIPLE_REGRESSION_BASED
															
 
																 export starpu_tag_t
															
 
																 export STARPU_NONE,STARPU_R,STARPU_W,STARPU_RW, STARPU_SCRATCH
															
 
																-export STARPU_REDUX,STARPU_COMMUTE, STARPU_SSEND, STARPU_LOCALITY
															
 
																+export STARPU_MPI_REDUX, STARPU_REDUX,STARPU_COMMUTE, STARPU_SSEND, STARPU_LOCALITY
															
 
																 export STARPU_ACCESS_MODE_MAX
															
 
																 # BLAS
															
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -272,9 +272,27 @@ starpu_mpi_EXAMPLES +=				\
 
																 	matrix_decomposition/mpi_cholesky_distributed
															
 
																 endif
															
 
																-########################
															
 
																+##############
															
 
																+# CG example #
															
 
																+##############
															
 
																+
															
 
																+if !STARPU_SIMGRID
															
 
																+if !STARPU_NO_BLAS_LIB
															
 
																+examplebin_PROGRAMS += cg/cg
															
 
																+starpu_mpi_EXAMPLES += cg/cg
															
 
																+
															
 
																+cg_cg_SOURCES =					\
															
 
																+	cg/cg.c						\
															
 
																+	../../examples/common/blas.c
															
 
																+
															
 
																+cg_cg_LDADD =					\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+endif
															
 
																+endif
															
 
																+
															
 
																+###########################
															
 
																 # MPI Matrix mult example #
															
 
																-########################
															
 
																+###########################
															
 
																 examplebin_PROGRAMS +=		\
															
 
																 	matrix_mult/mm
															
@@ -290,6 +308,24 @@ starpu_mpi_EXAMPLES +=				\
 
																 	matrix_mult/mm
															
 
																 endif
															
 
																+########################
															
 
																+# MPI STARPU_MPI_REDUX #
															
 
																+########################
															
 
																+
															
 
																+examplebin_PROGRAMS +=		\
															
 
																+	mpi_redux/mpi_redux
															
 
																+
															
 
																+mpi_redux_mpi_redux_SOURCES	=		\
															
 
																+	mpi_redux/mpi_redux.c
															
 
																+
															
 
																+mpi_redux_mpi_redux_LDADD =			\
															
 
																+	-lm
															
 
																+
															
 
																+if !STARPU_SIMGRID
															
 
																+starpu_mpi_EXAMPLES +=				\
															
 
																+	mpi_redux/mpi_redux
															
 
																+endif
															
 
																+
															
 
																 ##########################################
															
 
																 # Native Fortran MPI Matrix mult example #
															
 
																 ##########################################
															
@@ -336,6 +372,55 @@ endif
 
																 endif
															
 
																 endif
															
 
																+########################################
															
 
																+# Native Fortran MPI nf_mpi_redux test #
															
 
																+########################################
															
 
																+
															
 
																+if STARPU_HAVE_MPIFORT
															
 
																+if !STARPU_SANITIZE
															
 
																+examplebin_PROGRAMS +=		\
															
 
																+	native_fortran/nf_mpi_redux
															
 
																+
															
 
																+native_fortran_nf_mpi_redux_SOURCES	=			\
															
 
																+	native_fortran/fstarpu_mpi_mod.f90	\
															
 
																+	native_fortran/fstarpu_mod.f90		\
															
 
																+	native_fortran/nf_mpi_redux.f90	
															
 
																+
															
 
																+native_fortran_nf_mpi_redux_LDADD =					\
															
 
																+	-lm
															
 
																+
															
 
																+if !STARPU_SIMGRID
															
 
																+starpu_mpi_EXAMPLES +=				\
															
 
																+	native_fortran/nf_mpi_redux
															
 
																+endif
															
 
																+endif
															
 
																+endif
															
 
																+
															
 
																+########################################
															
 
																+# Native Fortran MPI STARPU_REDUX test #
															
 
																+########################################
															
 
																+
															
 
																+if STARPU_HAVE_MPIFORT
															
 
																+if !STARPU_SANITIZE
															
 
																+examplebin_PROGRAMS +=		\
															
 
																+	native_fortran/nf_redux_test
															
 
																+
															
 
																+native_fortran_nf_redux_test_SOURCES	=			\
															
 
																+	native_fortran/fstarpu_mpi_mod.f90	\
															
 
																+	native_fortran/fstarpu_mod.f90		\
															
 
																+	native_fortran/nf_redux_test.f90	
															
 
																+
															
 
																+native_fortran_nf_redux_test_LDADD =					\
															
 
																+	-lm
															
 
																+
															
 
																+if !STARPU_SIMGRID
															
 
																+starpu_mpi_EXAMPLES +=				\
															
 
																+	native_fortran/nf_redux_test
															
 
																+endif
															
 
																+endif
															
 
																+endif
															
 
																+
															
 
																+
															
 
																 ###################
															
 
																 # complex example #
															
 
																 ###################
															
@@ -427,6 +512,8 @@ native_fortran/nf_mm_cl.o: fstarpu_mod.mod
 
																 native_fortran/nf_mm.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
															
 
																 native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
															
 
																 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
															
 
																+native_fortran/nf_redux_test.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
															
 
																+native_fortran/nf_mpi_redux.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
															
 
																 endif
															
 
																 endif
															
--- a/mpi/examples/cg/cg.c
+++ b/mpi/examples/cg/cg.c
@@ -0,0 +1,422 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <math.h>
															
 
																+#include <assert.h>
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_mpi.h>
															
 
																+#include <common/blas.h>
															
 
																+
															
 
																+/*
															
 
																+ * Distributed version of Conjugate Gradient implemented in examples/cg/cg.c
															
 
																+ *
															
 
																+ * Use -display-result option and compare with the non-distributed version: the
															
 
																+ * x vector should be the same.
															
 
																+ */
															
 
																+
															
 
																+#include "../../../examples/cg/cg.h"
															
 
																+
															
 
																+static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, unsigned nblocks);
															
 
																+
															
 
																+#define HANDLE_TYPE_VECTOR starpu_data_handle_t*
															
 
																+#define HANDLE_TYPE_MATRIX starpu_data_handle_t**
															
 
																+#define TASK_INSERT(cl, ...) starpu_mpi_task_insert(MPI_COMM_WORLD, cl, ##__VA_ARGS__)
															
 
																+#define GET_VECTOR_BLOCK(v, i) v[i]
															
 
																+#define GET_MATRIX_BLOCK(m, i, j) m[i][j]
															
 
																+#define BARRIER() starpu_mpi_barrier(MPI_COMM_WORLD);
															
 
																+#define GET_DATA_HANDLE(handle) starpu_mpi_get_data_on_all_nodes_detached(MPI_COMM_WORLD, handle)
															
 
																+
															
 
																+static int block_size;
															
 
																+
															
 
																+static int rank;
															
 
																+static int nodes_p = 2;
															
 
																+static int nodes_q;
															
 
																+
															
 
																+static TYPE ***A;
															
 
																+static TYPE **x;
															
 
																+static TYPE **b;
															
 
																+
															
 
																+static TYPE **r;
															
 
																+static TYPE **d;
															
 
																+static TYPE **q;
															
 
																+
															
 
																+#define FPRINTF_SERVER(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT") && rank == 0) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
															
 
																+
															
 
																+#include "../../../examples/cg/cg_kernels.c"
															
 
																+
															
 
																+/*
																+ * Return the rank that owns block (y, x): the row index is wrapped over
																+ * nodes_q, the column index over nodes_p, and both are combined into a
																+ * single rank number.
																+ */
																+static int my_distrib(const int y, const int x)
																+{
																+	const int grid_row = y % nodes_q;
																+	const int grid_col = x % nodes_p;
																+
																+	return grid_row * nodes_p + grid_col;
																+}
															
 
																+
															
 
																+/*
																+ * Copy every block of src into the matching block of dst.  A copy is only
																+ * submitted by the rank that my_distrib() returns for vector block (blk, 0).
																+ * Always returns 0.
																+ */
																+static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, unsigned nblocks)
																+{
																+	unsigned blk = 0;
																+
																+	while (blk < nblocks)
																+	{
																+		const int owner = my_distrib(blk, 0);
																+
																+		if (owner == rank)
																+		{
																+			/* Asynchronous copy, no completion callback */
																+			starpu_data_cpy(dst[blk], src[blk], 1, NULL, NULL);
																+		}
																+
																+		blk++;
																+	}
																+
																+	return 0;
																+}
															
 
																+
															
 
																+/*
															
 
																+ *	Generate Input data
															
 
																+ */
															
 
																+/*
																+ * Allocate the host-side blocks of A, x, b, r, d and q and fill the ones
																+ * owned by this rank: x, r, d and q are zeroed, b is set to 1 and A receives
																+ * the matching entries of a Hilbert matrix.  Blocks owned by other ranks are
																+ * not allocated, except for x which is also allocated when display_result
																+ * is set.
																+ */
																+static void generate_random_problem(void)
																+{
																+	unsigned nn, mm, m, n;
																+	/* Signed, like rank and the value returned by my_distrib() */
																+	int mpi_rank;
																+
																+	A = malloc(nblocks * sizeof(TYPE **));
																+	x = malloc(nblocks * sizeof(TYPE *));
																+	b = malloc(nblocks * sizeof(TYPE *));
																+
																+	r = malloc(nblocks * sizeof(TYPE *));
																+	d = malloc(nblocks * sizeof(TYPE *));
																+	q = malloc(nblocks * sizeof(TYPE *));
																+
																+	/* The tables are dereferenced right below: stop early if one is missing */
																+	assert(A != NULL && x != NULL && b != NULL);
																+	assert(r != NULL && d != NULL && q != NULL);
																+
																+	for (m = 0; m < nblocks; m++)
																+	{
																+		A[m] = malloc(nblocks * sizeof(TYPE*));
																+		assert(A[m] != NULL);
																+
																+		mpi_rank = my_distrib(m, 0);
																+
																+		if (mpi_rank == rank || display_result)
																+		{
																+			starpu_malloc((void**) &x[m], block_size*sizeof(TYPE));
																+		}
																+
																+		if (mpi_rank == rank)
																+		{
																+			starpu_malloc((void**) &b[m], block_size*sizeof(TYPE));
																+			starpu_malloc((void**) &r[m], block_size*sizeof(TYPE));
																+			starpu_malloc((void**) &d[m], block_size*sizeof(TYPE));
																+			starpu_malloc((void**) &q[m], block_size*sizeof(TYPE));
																+
																+			for (mm = 0; mm < block_size; mm++)
																+			{
																+				x[m][mm] = (TYPE) 0.0;
																+				b[m][mm] = (TYPE) 1.0;
																+				r[m][mm] = (TYPE) 0.0;
																+				d[m][mm] = (TYPE) 0.0;
																+				q[m][mm] = (TYPE) 0.0;
																+			}
																+		}
																+
																+		for (n = 0; n < nblocks; n++)
																+		{
																+			mpi_rank = my_distrib(m, n);
																+			if (mpi_rank == rank)
																+			{
																+				starpu_malloc((void**) &A[m][n], block_size*block_size*sizeof(TYPE));
																+
																+				for (nn = 0; nn < block_size; nn++)
																+				{
																+					for (mm = 0; mm < block_size; mm++)
																+					{
																+						/* We use a Hilbert matrix, which is ill-conditioned but positive definite: H(i,j) = 1/(1+i+j) */
																+						A[m][n][mm + nn*block_size] = (TYPE) (1.0/(1.0+(nn+(m*block_size)+mm+(n*block_size))));
																+					}
																+				}
																+			}
																+		}
																+	}
																+}
															
 
																+
															
 
																+/*
																+ * Release what generate_random_problem() allocated: the blocks obtained
																+ * with starpu_malloc() are freed under the same ownership conditions that
																+ * were used to allocate them, then the pointer tables are freed.
																+ */
																+static void free_data(void)
																+{
																+	unsigned m, n;
																+	/* Signed, like rank and the value returned by my_distrib() */
																+	int mpi_rank;
																+
																+	for (m = 0; m < nblocks; m++)
																+	{
																+		mpi_rank = my_distrib(m, 0);
																+
																+		if (mpi_rank == rank || display_result)
																+		{
																+			starpu_free((void*) x[m]);
																+		}
																+
																+		if (mpi_rank == rank)
																+		{
																+			starpu_free((void*) b[m]);
																+			starpu_free((void*) r[m]);
																+			starpu_free((void*) d[m]);
																+			starpu_free((void*) q[m]);
																+		}
																+
																+		for (n = 0; n < nblocks; n++)
																+		{
																+			mpi_rank = my_distrib(m, n);
																+			if (mpi_rank == rank)
																+			{
																+				starpu_free((void*) A[m][n]);
																+			}
																+		}
																+
																+		free(A[m]);
																+	}
																+
																+	free(A);
																+	free(x);
																+	free(b);
																+	free(r);
																+	free(d);
																+	free(q);
																+}
															
 
																+
															
 
																+/*
																+ * Register every block of x, b, r, d, q and A, plus the dtq and rtr
																+ * scalars, with StarPU and StarPU-MPI.  A block is backed by local memory on
																+ * the rank returned by my_distrib() (and, for x, on every rank when
																+ * display_result is set); elsewhere it is registered with home node -1 and
																+ * no buffer.  MPI tags come from one counter that is incremented before each
																+ * registration, in an order that does not depend on the rank.
																+ */
																+static void register_data(void)
																+{
																+	unsigned row, col;
																+	int owner;
																+	starpu_mpi_tag_t next_tag = 0;
																+
																+	A_handle = malloc(nblocks*sizeof(starpu_data_handle_t*));
																+	x_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
																+	b_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
																+	r_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
																+	d_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
																+	q_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
																+
																+	for (row = 0; row < nblocks; row++)
																+	{
																+		int is_local;
																+		int x_is_local;
																+		int vec_node;
																+
																+		owner = my_distrib(row, 0);
																+		A_handle[row] = malloc(nblocks*sizeof(starpu_data_handle_t));
																+
																+		is_local = (owner == rank);
																+		/* x also gets a local buffer on every rank when the result is displayed */
																+		x_is_local = (is_local || display_result);
																+		vec_node = is_local ? STARPU_MAIN_RAM : -1;
																+
																+		starpu_vector_data_register(&x_handle[row], x_is_local ? STARPU_MAIN_RAM : -1, x_is_local ? (uintptr_t) x[row] : (uintptr_t) NULL, block_size, sizeof(TYPE));
																+
																+		starpu_vector_data_register(&b_handle[row], vec_node, is_local ? (uintptr_t) b[row] : (uintptr_t) NULL, block_size, sizeof(TYPE));
																+		starpu_vector_data_register(&r_handle[row], vec_node, is_local ? (uintptr_t) r[row] : (uintptr_t) NULL, block_size, sizeof(TYPE));
																+		starpu_vector_data_register(&d_handle[row], vec_node, is_local ? (uintptr_t) d[row] : (uintptr_t) NULL, block_size, sizeof(TYPE));
																+		starpu_vector_data_register(&q_handle[row], vec_node, is_local ? (uintptr_t) q[row] : (uintptr_t) NULL, block_size, sizeof(TYPE));
																+
																+		/* Tag order per row: x, b, r, d, q, then the blocks of A */
																+		starpu_data_set_coordinates(x_handle[row], 1, row);
																+		starpu_mpi_data_register(x_handle[row], ++next_tag, owner);
																+		starpu_data_set_coordinates(b_handle[row], 1, row);
																+		starpu_mpi_data_register(b_handle[row], ++next_tag, owner);
																+		starpu_data_set_coordinates(r_handle[row], 1, row);
																+		starpu_mpi_data_register(r_handle[row], ++next_tag, owner);
																+		starpu_data_set_coordinates(d_handle[row], 1, row);
																+		starpu_mpi_data_register(d_handle[row], ++next_tag, owner);
																+		starpu_data_set_coordinates(q_handle[row], 1, row);
																+		starpu_mpi_data_register(q_handle[row], ++next_tag, owner);
																+
																+		if (use_reduction)
																+		{
																+			starpu_data_set_reduction_methods(q_handle[row], &accumulate_vector_cl, &bzero_vector_cl);
																+			starpu_data_set_reduction_methods(r_handle[row], &accumulate_vector_cl, &bzero_vector_cl);
																+		}
																+
																+		for (col = 0; col < nblocks; col++)
																+		{
																+			int block_is_local;
																+
																+			owner = my_distrib(row, col);
																+			block_is_local = (owner == rank);
																+
																+			starpu_matrix_data_register(&A_handle[row][col], block_is_local ? STARPU_MAIN_RAM : -1, block_is_local ? (uintptr_t) A[row][col] : (uintptr_t) NULL, block_size, block_size, block_size, sizeof(TYPE));
																+
																+			starpu_data_set_coordinates(A_handle[row][col], 2, col, row);
																+			starpu_mpi_data_register(A_handle[row][col], ++next_tag, owner);
																+		}
																+	}
																+
																+	/* The two scalars live in main memory and are attached to rank 0 */
																+	starpu_variable_data_register(&dtq_handle, STARPU_MAIN_RAM, (uintptr_t)&dtq, sizeof(TYPE));
																+	starpu_variable_data_register(&rtr_handle, STARPU_MAIN_RAM, (uintptr_t)&rtr, sizeof(TYPE));
																+	starpu_mpi_data_register(rtr_handle, ++next_tag, 0);
																+	starpu_mpi_data_register(dtq_handle, ++next_tag, 0);
																+
																+	if (use_reduction)
																+	{
																+		starpu_data_set_reduction_methods(dtq_handle, &accumulate_variable_cl, &bzero_variable_cl);
																+		starpu_data_set_reduction_methods(rtr_handle, &accumulate_variable_cl, &bzero_variable_cl);
																+	}
																+}
															
 
																+
															
 
																+/*
																+ * Unregister every handle created by register_data() and free the handle
																+ * tables.
																+ */
																+static void unregister_data(void)
																+{
																+	unsigned row;
																+
																+	for (row = 0; row < nblocks; row++)
																+	{
																+		unsigned col;
																+
																+		/* Vector blocks of this row first, then the row of A */
																+		starpu_data_unregister(x_handle[row]);
																+		starpu_data_unregister(b_handle[row]);
																+		starpu_data_unregister(r_handle[row]);
																+		starpu_data_unregister(d_handle[row]);
																+		starpu_data_unregister(q_handle[row]);
																+
																+		for (col = 0; col < nblocks; col++)
																+		{
																+			starpu_data_unregister(A_handle[row][col]);
																+		}
																+
																+		free(A_handle[row]);
																+	}
																+
																+	starpu_data_unregister(dtq_handle);
																+	starpu_data_unregister(rtr_handle);
																+
																+	free(A_handle);
																+	free(x_handle);
																+	free(b_handle);
																+	free(r_handle);
																+	free(d_handle);
																+	free(q_handle);
																+}
															
 
																+
															
 
																+/*
																+ * Request every block of x on node 0, then let rank 0 print the vector one
																+ * element per line.
																+ */
																+static void display_x_result(void)
																+{
																+	int blk, elt;
																+
																+	/* This loop runs on every rank, before the rank test below */
																+	for (blk = 0; blk < nblocks; blk++)
																+	{
																+		starpu_mpi_get_data_on_node(MPI_COMM_WORLD, x_handle[blk], 0);
																+	}
																+
																+	if (rank != 0)
																+	{
																+		return;
																+	}
																+
																+	FPRINTF_SERVER(stderr, "Computed X vector:\n");
																+	for (blk = 0; blk < nblocks; blk++)
																+	{
																+		/* Hold the block in read mode while its buffer is printed */
																+		starpu_data_acquire(x_handle[blk], STARPU_R);
																+		for (elt = 0; elt < block_size; elt++)
																+		{
																+			FPRINTF(stderr, "% 02.2e\n", x[blk][elt]);
																+		}
																+		starpu_data_release(x_handle[blk]);
																+	}
																+}
															
 
																+
															
 
																+/*
																+ * Parse the options specific to the distributed version (-p and the help
																+ * flags), then hand the command line to parse_common_args().
																+ *
																+ * Exits with status -1 after printing the usage, when -p has no value, or
																+ * when the value given to -p is not a positive integer.
																+ */
																+static void parse_args(int argc, char **argv)
																+{
																+	int i;
																+	for (i = 1; i < argc; i++)
																+	{
																+		if (strcmp(argv[i], "-p") == 0)
																+		{
																+			/* -p needs a value: never read past the end of argv */
																+			if (i + 1 >= argc)
																+			{
																+				FPRINTF_SERVER(stderr, "Option -p requires a node grid width.\n");
																+				exit(-1);
																+			}
																+
																+			nodes_p = atoi(argv[++i]);
																+
																+			/* atoi() yields 0 on non-numeric input, and main() computes worldsize % nodes_p */
																+			if (nodes_p <= 0)
																+			{
																+				FPRINTF_SERVER(stderr, "Node grid width (-p) must be a positive integer.\n");
																+				exit(-1);
																+			}
																+			continue;
																+		}
																+
																+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
																+		{
																+			FPRINTF_SERVER(stderr, "usage: %s [-h] [-nblocks #blocks] [-display-result] [-p node_grid_width] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
																+			exit(-1);
																+		}
																+	}
																+
																+	parse_common_args(argc, argv);
																+}
															
 
																+
															
 
																+/*
																+ * Entry point: initialise StarPU-MPI, validate the node grid and the block
																+ * decomposition, build and register the problem, run cg(), optionally print
																+ * x, then release everything.
																+ *
																+ * Returns 77 when STARPU_GLOBAL_ARBITER is set or when -ENODEV is reported,
																+ * 1 when the node grid or the block count does not divide evenly, and the
																+ * value returned by cg() otherwise.
																+ */
																+int main(int argc, char **argv)
																+{
																+	int worldsize, ret;
																+	double start, end;
																+
																+	/* Not supported yet */
																+	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
																+		return 77;
																+
																+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
																+	if (ret == -ENODEV)
																+		return 77;
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
																+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
																+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
																+
																+	parse_args(argc, argv);
																+
																+	if (worldsize % nodes_p != 0)
																+	{
																+		FPRINTF_SERVER(stderr, "Node grid (%d) width must divide the number of nodes (%d).\n", nodes_p, worldsize);
																+		starpu_mpi_shutdown();
																+		return 1;
																+	}
																+	nodes_q = worldsize / nodes_p;
																+
																+	if (n % nblocks != 0)
																+	{
																+		FPRINTF_SERVER(stderr, "The number of blocks (%d) must divide the matrix size (%lld).\n", nblocks, n);
																+		starpu_mpi_shutdown();
																+		return 1;
																+	}
																+	block_size = n / nblocks;
																+
																+	starpu_cublas_init();
																+
																+	FPRINTF_SERVER(stderr, "************** PARAMETERS ***************\n");
																+	FPRINTF_SERVER(stderr, "%d nodes (%dx%d)\n", worldsize, nodes_p, nodes_q);
																+	FPRINTF_SERVER(stderr, "Problem size (-n): %lld\n", n);
																+	FPRINTF_SERVER(stderr, "Maximum number of iterations (-maxiter): %d\n", i_max);
																+	FPRINTF_SERVER(stderr, "Number of blocks (-nblocks): %d\n", nblocks);
																+	FPRINTF_SERVER(stderr, "Reduction (-no-reduction): %s\n", use_reduction ? "enabled" : "disabled");
																+
																+	/* Barriers on both sides so that the measured span covers all ranks */
																+	starpu_mpi_barrier(MPI_COMM_WORLD);
																+	start = starpu_timing_now();
																+	generate_random_problem();
																+	register_data();
																+	starpu_mpi_barrier(MPI_COMM_WORLD);
																+	end = starpu_timing_now();
																+
																+	/* starpu_timing_now() is documented in microseconds: 1e6 of them per second */
																+	FPRINTF_SERVER(stderr, "Problem initialization timing : %2.2f seconds\n", (end-start)/1e6);
																+
																+	ret = cg();
																+	if (ret == -ENODEV)
																+	{
																+		ret = 77;
																+		goto enodev;
																+	}
																+
																+	starpu_task_wait_for_all();
																+
																+	if (display_result)
																+	{
																+		display_x_result();
																+	}
																+
																+enodev:
																+	unregister_data();
																+	free_data();
																+	starpu_cublas_shutdown();
																+	starpu_mpi_shutdown();
																+	return ret;
																+}
															
--- a/mpi/examples/mpi_redux/mpi_redux.c
+++ b/mpi/examples/mpi_redux/mpi_redux.c
@@ -0,0 +1,201 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+/*
															
 
																+ * This example illustrates how to use the STARPU_MPI_REDUX mode
															
 
																+ * and compare it with the standard STARPU_REDUX.
															
 
																+ *
															
 
																+ * In order to make this comparison salient, the init codelet is not
																+ * a task that sets the handle to a neutral element but rather depends
																+ * on the working node.
																+ * This is not a proper way to use a reduction pattern; however, it
																+ * can be analogous to the cost/weight of each contribution.
															
 
																+ */
															
 
																+
															
 
																+#include <stdlib.h>
															
 
																+#include <stdio.h>
															
 
																+#include <assert.h>
															
 
																+#include <math.h>
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_mpi.h>
															
 
																+#include "helper.h"
															
 
																+#include <unistd.h>
															
 
																+
															
 
																+/* Work kernel used by both work codelets: sleeps 2 seconds, then adds 3.0
																+ * and the value of the second buffer to the first buffer, printing the
																+ * first buffer before and after the update. */
																+static void cl_cpu_work(void *handles[], void*arg)
																+{
																+	double *acc = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
																+	double *contrib = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
																+	double before;
																+
																+	(void)arg;
																+	sleep(2);
																+	before = *acc;
																+	printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), before);
																+	*acc = 3.0 + before + *contrib;
																+	printf("%f\n",*acc);
																+}
															
 
																+
															
 
																+/* Work codelet for the STARPU_REDUX pass: buffer 0 is accessed in
																+ * STARPU_REDUX mode, buffer 1 is read-only. The name is distinct from the
																+ * one of task_init_cl so that both kinds of tasks can be told apart. */
																+static struct starpu_codelet work_cl =
																+{
																+	.cpu_funcs = { cl_cpu_work },
																+	.nbuffers = 2,
																+	.modes = { STARPU_REDUX, STARPU_R },
																+	.name = "task_work"
																+};
															
 
																+
															
 
																+/* Work codelet for the STARPU_MPI_REDUX pass: buffer 0 is declared as
																+ * STARPU_RW | STARPU_COMMUTE, buffer 1 is read-only. The name is distinct
																+ * from the one of task_init_cl so that both kinds of tasks can be told
																+ * apart. */
																+static struct starpu_codelet mpi_work_cl =
																+{
																+	.cpu_funcs = { cl_cpu_work },
																+	.nbuffers = 2,
																+	.modes = { STARPU_RW | STARPU_COMMUTE, STARPU_R },
																+	.name = "task_work-mpi"
																+};
															
 
																+
															
 
																+/* Initialization kernel of the reduction: sleeps 1 second, prints the
																+ * previous content of the buffer and overwrites it with the MPI world
																+ * rank of the executing node (deliberately not a neutral element, see the
																+ * comment at the top of the file). */
																+static void cl_cpu_task_init(void *handles[], void*arg)
																+{
																+	double *value = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
																+	int rank;
																+
																+	(void) arg;
																+	sleep(1);
																+	rank = starpu_mpi_world_rank();
																+	printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", rank, starpu_worker_get_id(), rank, *value);
																+	*value = rank;
																+}
															
 
																+
															
 
																+/* Initialization codelet handed to starpu_data_set_reduction_methods():
																+ * a single buffer, write-only. */
																+static struct starpu_codelet task_init_cl =
																+{
																+	.name = "task_init",
																+	.nbuffers = 1,
																+	.modes = { STARPU_W },
																+	.cpu_funcs = { cl_cpu_task_init }
																+};
															
 
																+
															
 
																+/* Reduction kernel: sleeps 2 seconds, prints both operands and their sum,
																+ * then adds the second buffer (source) into the first one (destination). */
																+static void cl_cpu_task_red(void *handles[], void*arg)
																+{
																+	double *dst = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
																+	double *src = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
																+
																+	(void) arg;
																+	sleep(2);
																+	printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *src, *dst, *src+*dst);
																+	*dst += *src;
																+}
															
 
																+
															
 
																+/* Reduction codelet handed to starpu_data_set_reduction_methods():
																+ * buffer 0 is read-write (destination), buffer 1 is read-only (source). */
																+static struct starpu_codelet task_red_cl =
																+{
																+	.name = "task_red",
																+	.nbuffers = 2,
																+	.modes = { STARPU_RW, STARPU_R },
																+	.cpu_funcs = { cl_cpu_task_red }
																+};
															
 
																+
															
 
																+/*
																+ * Runs the same reduction twice, first with STARPU_MPI_REDUX and then with
																+ * STARPU_REDUX, and prints on rank 0 the value obtained for a next to the
																+ * expected one.
																+ *
																+ * a_h is registered with MPI owner 0 and b_h[j] with MPI owner j. Every node
																+ * work_node >= 1 is asked to execute work_coef * nworkers work tasks that
																+ * read b_h[work_node] and contribute to a_h.
																+ *
																+ * Returns STARPU_TEST_SKIPPED when fewer than 2 CPU workers or fewer than
																+ * 2 MPI nodes are available, 0 otherwise.
																+ */
																+int main(int argc, char *argv[])
																+{
																+	int comm_rank, comm_size;
																+	/* Initializes StarPU and the StarPU-MPI layer */
																+	starpu_fxt_autostart_profiling(0);
																+	int ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
																+
																+	int nworkers = starpu_cpu_worker_get_count();
																+	if (nworkers < 2)
																+	{
																+		FPRINTF(stderr, "We need at least 2 CPU worker per node.\n");
																+		starpu_mpi_shutdown();
																+		return STARPU_TEST_SKIPPED;
																+	}
																+	starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size);
																+	if (comm_size < 2)
																+	{
																+		FPRINTF(stderr, "We need at least 2 nodes.\n");
																+		starpu_mpi_shutdown();
																+		return STARPU_TEST_SKIPPED;
																+	}
																+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
																+
																+	double a, b[comm_size];
																+	starpu_data_handle_t a_h, b_h[comm_size];
																+	double work_coef = 2;
																+	enum starpu_data_access_mode task_mode;
																+	struct starpu_codelet *task_cl;
																+	int i,j,work_node;
																+	starpu_mpi_tag_t tag = 0;
																+	for (i = 0 ; i < 2 ; i++)
																+	{
																+		starpu_mpi_barrier(MPI_COMM_WORLD);
																+		/* First pass: STARPU_MPI_REDUX with the RW|COMMUTE codelet,
																+		 * second pass: STARPU_REDUX with the REDUX codelet. */
																+		if (i==0)
																+		{
																+			task_mode = STARPU_MPI_REDUX;
																+			task_cl = &mpi_work_cl;
																+		}
																+		else
																+		{
																+			task_mode = STARPU_REDUX;
																+			task_cl = &work_cl;
																+		}
																+		if (comm_rank == 0)
																+		{
																+			/* Rank 0 provides the buffer of a; every b_h[j] is
																+			 * registered with home node -1 and no buffer. */
																+			a = 1.0;
																+			printf("init a = %f\n", a);
																+			starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double));
																+			for (j=0;j<comm_size;j++)
																+				starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
																+		}
																+		else
																+		{
																+			/* Other ranks provide the buffer of their own b entry
																+			 * only; a_h and the other b_h[j] are registered with
																+			 * home node -1 and no buffer. */
																+			b[comm_rank] = 1.0 / (comm_rank + 1.0);
																+			printf("init b_%d = %f\n", comm_rank, b[comm_rank]);
																+			starpu_variable_data_register(&a_h, -1, 0, sizeof(double));
																+			for (j=0;j<comm_size;j++)
																+			{
																+				if (j == comm_rank)
																+					starpu_variable_data_register(&b_h[j], STARPU_MAIN_RAM, (uintptr_t)&b[j], sizeof(double));
																+				else
																+					starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
																+			}
																+		}
																+		/* tag keeps increasing across both passes, so every
																+		 * registration gets a fresh tag value */
																+		starpu_mpi_data_register(a_h, tag++, 0);
																+		for (j=0;j<comm_size;j++)
																+			starpu_mpi_data_register(b_h[j], tag++, j);
																+
																+		starpu_data_set_reduction_methods(a_h, &task_red_cl, &task_init_cl);
																+		starpu_fxt_start_profiling();
																+		for (work_node=1; work_node < comm_size;work_node++)
																+		{
																+			for (j=1;j<=work_coef*nworkers;j++)
																+			{
																+				starpu_mpi_task_insert(MPI_COMM_WORLD,
																+					task_cl,
																+					task_mode, a_h,
																+					STARPU_R, b_h[work_node],
																+					STARPU_EXECUTE_ON_NODE, work_node,
																+					0);
																+			}
																+		}
																+		starpu_mpi_redux_data(MPI_COMM_WORLD, a_h);
																+		starpu_mpi_wait_for_all(MPI_COMM_WORLD);
																+		starpu_mpi_barrier(MPI_COMM_WORLD);
																+		if (comm_rank == 0)
																+		{
																+			/* tmp is the sum of the b values set by ranks 1..comm_size-1 */
																+			double tmp = 0.0;
																+			for (work_node = 1; work_node < comm_size ; work_node++)
																+				tmp += 1.0 / (work_node + 1.0);
																+			printf("computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp));
																+		}
																+		starpu_data_unregister(a_h);
																+		for (work_node=0; work_node < comm_size;work_node++)
																+			starpu_data_unregister(b_h[work_node]);
																+		starpu_mpi_barrier(MPI_COMM_WORLD);
																+	}
																+	starpu_mpi_shutdown();
																+	return 0;
																+}
															
--- a/mpi/examples/native_fortran/nf_mpi_redux.f90
+++ b/mpi/examples/native_fortran/nf_mpi_redux.f90
@@ -0,0 +1,253 @@
 
																+! StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+!
															
 
																+! Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+!
															
 
																+! StarPU is free software; you can redistribute it and/or modify
															
 
																+! it under the terms of the GNU Lesser General Public License as published by
															
 
																+! the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+! your option) any later version.
															
 
																+!
															
 
																+! StarPU is distributed in the hope that it will be useful, but
															
 
																+! WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+!
															
 
																+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+!
															
 
																+! Runs the same reduction twice, first with FSTARPU_MPI_REDUX and then with
																+! FSTARPU_REDUX, and prints on rank 0 the value obtained for a next to the
																+! expected one. Stops with code 2 when fewer than two nodes or fewer than
																+! one worker are available.
																+program nf_mpi_redux
																+  use iso_c_binding
																+  use fstarpu_mod
																+  use fstarpu_mpi_mod
																+
																+  implicit none
																+
																+  integer, target                         :: ret, i, trial
																+  type(c_ptr)                             :: work_cl, task_red_cl, task_ini_cl
																+  character(kind=c_char,len=*), parameter :: name=C_CHAR_"task"//C_NULL_CHAR
																+  character(kind=c_char,len=*), parameter :: namered=C_CHAR_"task_red"//C_NULL_CHAR
																+  character(kind=c_char,len=*), parameter :: nameini=C_CHAR_"task_ini"//C_NULL_CHAR
																+  real(kind(1.d0)), target                :: a,tmp
																+  real(kind(1.d0)), target, allocatable   :: b(:)
																+  integer(kind=8)                         :: tag, err
																+  type(c_ptr)                             :: ahdl
																+  type(c_ptr), target, allocatable        :: bhdl(:)
																+  type(c_ptr)                             :: task_mode, codelet_mode
																+  integer, target                         :: comm_world,comm_w_rank, comm_size
																+  integer(c_int), target                  :: w_node, nworkers, work_coef
																+
																+  call fstarpu_fxt_autostart_profiling(0)
																+  ret = fstarpu_init(c_null_ptr)
																+  ret = fstarpu_mpi_init(1)
																+
																+  comm_world = fstarpu_mpi_world_comm()
																+  comm_w_rank  = fstarpu_mpi_world_rank()
																+  comm_size  = fstarpu_mpi_world_size()
																+  if (comm_size.lt.2) then
																+    write(*,'(" ")')
																+    write(*,'("This application is meant to run with at least two nodes.")')
																+    ! shut StarPU-MPI and StarPU down like the normal exit path does
																+    err = fstarpu_mpi_shutdown()
																+    call fstarpu_shutdown()
																+    stop 2
																+  end if
																+  allocate(b(comm_size-1), bhdl(comm_size-1))
																+  nworkers = fstarpu_worker_get_count()
																+  if (nworkers.lt.1) then
																+    write(*,'(" ")')
																+    write(*,'("This application is meant to run with at least one worker per node.")')
																+    ! release what was set up so far, like the normal exit path does
																+    err = fstarpu_mpi_shutdown()
																+    call fstarpu_shutdown()
																+    deallocate(b, bhdl)
																+    stop 2
																+  end if
																+
																+  ! allocate and fill the reduction and initialization codelets
																+  task_red_cl = fstarpu_codelet_allocate()
																+  call fstarpu_codelet_set_name(task_red_cl, namered)
																+  call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
																+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW)
																+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
																+
																+  task_ini_cl = fstarpu_codelet_allocate()
																+  call fstarpu_codelet_set_name(task_ini_cl, nameini)
																+  call fstarpu_codelet_add_cpu_func(task_ini_cl,C_FUNLOC(cl_cpu_task_ini))
																+  call fstarpu_codelet_add_buffer(task_ini_cl, FSTARPU_W)
																+
																+  ! each node 1..comm_size-1 gets work_coef*nworkers work tasks
																+  work_coef=2
																+
																+  do trial=1,2
																+
																+  ! trial 1 uses the MPI reduction mode, trial 2 the plain reduction mode
																+  if (trial.eq.1) then
																+        write(*,*) "Using STARPU_MPI_REDUX"
																+        codelet_mode = FSTARPU_RW.ior.FSTARPU_COMMUTE
																+        task_mode = FSTARPU_MPI_REDUX
																+  else if (trial.eq.2) then
																+        write(*,*) "Using STARPU_REDUX"
																+        codelet_mode = FSTARPU_REDUX
																+        task_mode = FSTARPU_REDUX
																+  end if
																+  ! allocate and fill the work codelet, whose first buffer mode depends on the trial
																+  work_cl = fstarpu_codelet_allocate()
																+  call fstarpu_codelet_set_name(work_cl, name)
																+  call fstarpu_codelet_add_cpu_func(work_cl, C_FUNLOC(cl_cpu_task))
																+  call fstarpu_codelet_add_buffer(work_cl, codelet_mode)
																+  call fstarpu_codelet_add_buffer(work_cl, FSTARPU_R)
																+  err = fstarpu_mpi_barrier(comm_world)
																+
																+  ! rank 0 initializes a, every other rank initializes its own entry of b
																+  if(comm_w_rank.eq.0) then
																+    write(*,'(" ")')
																+    a = 1.0
																+    write(*,*) "init a = ", a
																+  else
																+    b(comm_w_rank) = 1.0 / (comm_w_rank + 1.0)
																+    write(*,*) "init b_",comm_w_rank,"=", b(comm_w_rank), " AT ", &
																+c_loc(bhdl(comm_w_rank)) ! This is not really meaningful
																+  end if
																+
																+  err = fstarpu_mpi_barrier(comm_world)
																+
																+  ! a buffer is only provided for the data set on this rank; the other
																+  ! handles are registered with home node -1 and a null pointer
																+  tag = 0
																+  if(comm_w_rank.eq.0) then
																+    call fstarpu_variable_data_register(ahdl, 0, c_loc(a),c_sizeof(a))
																+    do i=1,comm_size-1
																+        call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
																+    end do
																+  else
																+    call fstarpu_variable_data_register(ahdl, -1, c_null_ptr,c_sizeof(a))
																+    do i=1,comm_size-1
																+      if (i.eq.comm_w_rank) then
																+        call fstarpu_variable_data_register(bhdl(i), 0, c_loc(b(i)),c_sizeof(b(i)))
																+      else
																+        call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
																+      end if
																+    end do
																+  end if
																+  ! a is registered with MPI owner 0, b(i) with MPI owner i
																+  call fstarpu_mpi_data_register(ahdl,  tag,  0)
																+  do i=1,comm_size-1
																+     call fstarpu_mpi_data_register(bhdl(i), tag+i,i)
																+  end do
																+
																+  tag = tag + comm_size
																+
																+  call fstarpu_data_set_reduction_methods(ahdl,task_red_cl,task_ini_cl)
																+
																+  err = fstarpu_mpi_barrier(comm_world)
																+
																+
																+  call fstarpu_fxt_start_profiling()
																+  do w_node=1,comm_size-1
																+    do i=1,work_coef*nworkers
																+      call fstarpu_mpi_task_insert( (/ c_loc(comm_world),   &
																+             work_cl,                                         &
																+             task_mode, ahdl,                            &
																+             FSTARPU_R, bhdl(w_node),                      &
																+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
																+             C_NULL_PTR /))
																+    end do
																+  end do
																+  call fstarpu_mpi_redux_data(comm_world, ahdl)
																+  err = fstarpu_mpi_wait_for_all(comm_world)
																+
																+  if(comm_w_rank.eq.0) then
																+    ! tmp is the sum of the b values set by ranks 1..comm_size-1
																+    tmp = 0
																+    do w_node=1,comm_size-1
																+      tmp = tmp + 1.0 / (w_node+1.0)
																+    end do
																+    write(*,*) 'computed result ---> ',a, "expected =",&
																+      1.0 + (comm_size-1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1.0)*3.0 + tmp)
																+  end if
																+  err = fstarpu_mpi_barrier(comm_world)
																+  call fstarpu_data_unregister(ahdl)
																+  do w_node=1,comm_size-1
																+    call fstarpu_data_unregister(bhdl(w_node))
																+  end do
																+  call fstarpu_codelet_free(work_cl)
																+
																+  end do
																+
																+  call fstarpu_fxt_stop_profiling()
																+  call fstarpu_codelet_free(task_red_cl)
																+  call fstarpu_codelet_free(task_ini_cl)
																+
																+
																+  err = fstarpu_mpi_shutdown()
																+  call fstarpu_shutdown()
																+  deallocate(b, bhdl)
																+  stop
															
 
																+
															
 
																+contains
															
 
																+
															
 
																+  ! Work codelet: waits one second, then adds 3.0 and the value of the
																+  ! second buffer (b) to the first buffer (a), and prints the value of a
																+  ! before and after the update.
																+  !   buffers : StarPU buffer array of the task (a at index 0, b at index 1)
																+  !   cl_args : unused
																+  recursive subroutine cl_cpu_task (buffers, cl_args) bind(C)
																+    use iso_c_binding       ! C interfacing module
																+    use fstarpu_mod         ! StarPU interfacing module
																+    implicit none
																+
																+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
																+    integer(c_int) :: worker_id
																+    integer        :: comm_rank
																+    real(kind(1.d0)), pointer :: a, b
																+    real(kind(1.d0))          :: old_a
																+
																+    worker_id = fstarpu_worker_get_id()
																+    comm_rank  = fstarpu_mpi_world_rank()
																+
																+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
																+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), b)
																+    call nf_sleep(1.d0)
																+    ! keep the previous value so that it can be reported after the update
																+    old_a = a
																+    a = old_a + 3.0 + b
																+    write(*,*) "task   (c_w_rank:",comm_rank," worker_id:",worker_id,") from ",old_a,"to",a
																+
																+    return
																+  end subroutine cl_cpu_task
															
 
																+
															
 
																+  ! Reduction codelet: adds the second buffer (as, source) into the first
																+  ! buffer (ad, destination), waits one second and prints both operands
																+  ! and the result.
																+  !   buffers : StarPU buffer array of the task (ad at index 0, as at index 1)
																+  !   cl_args : unused
																+  recursive subroutine cl_cpu_task_red (buffers, cl_args) bind(C)
																+    use iso_c_binding       ! C interfacing module
																+    use fstarpu_mod         ! StarPU interfacing module
																+    implicit none
																+
																+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
																+    integer(c_int) :: worker_id
																+    integer, target                         :: comm_rank
																+    real(kind(1.d0)), pointer :: as, ad
																+    real(kind(1.d0))           :: old_ad
																+    worker_id = fstarpu_worker_get_id()
																+    comm_rank  = fstarpu_mpi_world_rank()
																+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), ad)
																+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), as)
																+    ! keep the previous destination value so that it can be reported below
																+    old_ad = ad
																+    ad = ad + as
																+    call nf_sleep(1.d0)
																+    write(*,*) "red_cl (c_w_rank:",comm_rank,"worker_id:",worker_id,")",as, old_ad, ' ---> ',ad
																+
																+    return
																+  end subroutine cl_cpu_task_red
															
 
																+
															
 
																+  ! Initialization codelet of the reduction: waits half a second, prints
																+  ! the previous content of the buffer and overwrites it with the MPI world
																+  ! rank of the executing node.
																+  !   buffers : StarPU buffer array of the task (a at index 0)
																+  !   cl_args : unused
																+  recursive subroutine cl_cpu_task_ini (buffers, cl_args) bind(C)
																+    use iso_c_binding       ! C interfacing module
																+    use fstarpu_mod         ! StarPU interfacing module
																+    implicit none
																+
																+    type(c_ptr), value, intent(in) :: buffers, cl_args
																+        ! cl_args is unused
																+    integer(c_int) :: worker_id
																+    integer, target                         :: comm_rank
																+    real(kind(1.d0)), pointer :: a
																+    worker_id = fstarpu_worker_get_id()
																+    comm_rank  = fstarpu_mpi_world_rank()
																+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
																+    call nf_sleep(0.5d0)
																+    ! As this codelet is run by each worker in the REDUX mode case
																+    ! this initialization makes salient the number of copies spawned
																+    write(*,*) "ini_cl (c_w_rank:",comm_rank,"worker_id:",worker_id,") set to", comm_rank, "(was",a,")"
																+    a = comm_rank
																+    return
																+  end subroutine cl_cpu_task_ini
															
 
																+
															
 
																+  ! Busy-waits, polling system_clock, until strictly more than t seconds
																+  ! have elapsed since the call. The clock is always read at least once
																+  ! after the starting tick has been taken.
																+  subroutine nf_sleep(t)
																+    implicit none
																+    real(kind(1.d0))     :: elapsed, t
																+    integer :: tick_first, tick_now, ticks_per_sec
																+    call system_clock(tick_first)
																+    ! start from "not elapsed yet" so that the loop body runs at least once
																+    elapsed = t
																+    do while (.not.(elapsed.gt.t))
																+       call system_clock(tick_now, ticks_per_sec)
																+       elapsed = real(tick_now-tick_first)/real(ticks_per_sec)
																+    end do
																+  end subroutine nf_sleep
															
 
																+
															
 
																+end program
															
--- a/mpi/examples/native_fortran/nf_redux_test.f90
+++ b/mpi/examples/native_fortran/nf_redux_test.f90
@@ -0,0 +1,238 @@
 
																+! StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+!
															
 
																+! Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+!
															
 
																+! StarPU is free software; you can redistribute it and/or modify
															
 
																+! it under the terms of the GNU Lesser General Public License as published by
															
 
																+! the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+! your option) any later version.
															
 
																+!
															
 
																+! StarPU is distributed in the hope that it will be useful, but
															
 
																+! WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+!
															
 
																+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+!
															
 
																+! Distributed reduction (REDUX) test over StarPU-MPI.
																+!
																+! Two pairs of variables are registered: (a1, b1) owned by rank 0 and
																+! (a2, b2) owned by rank 1. One task per pair is inserted in REDUX mode
																+! on another rank (3 for a1, 2 for a2); each task stores 3.0 + b into
																+! its copy of a. After the reduction the owners print the value they
																+! hold next to the expected one (4.5 for a1, 5.8 for a2).
																+! The program requires exactly 4 MPI processes.
																+program main
																+  use iso_c_binding
																+  use fstarpu_mod
																+  use fstarpu_mpi_mod
																+
																+  implicit none
																+
																+  integer, target                         :: ret
																+  type(c_ptr)                             :: task_cl, task_red_cl, task_ini_cl
																+  character(kind=c_char,len=*), parameter :: name=C_CHAR_"task"//C_NULL_CHAR
																+  character(kind=c_char,len=*), parameter :: namered=C_CHAR_"task_red"//C_NULL_CHAR
																+  character(kind=c_char,len=*), parameter :: nameini=C_CHAR_"task_ini"//C_NULL_CHAR
																+  real(kind(1.d0)), target                :: a1, a2, b1, b2
																+  integer(kind=8)                          :: tag, err
																+  type(c_ptr)                             :: a1hdl, a2hdl, b1hdl, b2hdl
																+  integer, target                         :: comm, comm_world, comm_w_rank, comm_size
																+  integer(c_int), target                  :: w_node
																+
																+  call fstarpu_fxt_autostart_profiling(0)
																+  ret = fstarpu_init(c_null_ptr)
																+  ! NOTE(review): the argument presumably asks StarPU-MPI to
																+  ! initialise MPI itself -- confirm against the fstarpu_mpi API.
																+  ret = fstarpu_mpi_init(1)
																+
																+  comm_world = fstarpu_mpi_world_comm()
																+  comm_w_rank  = fstarpu_mpi_world_rank()
																+  comm_size  = fstarpu_mpi_world_size()
																+  if (comm_size.ne.4) then
																+    write(*,'(" ")')
																+    write(*,'("This application is meant to run with 4 MPI")')
																+    ! Shut the runtime down exactly as the normal exit path does, so
																+    ! that the early exit does not leave StarPU-MPI and StarPU
																+    ! initialised.
																+    err = fstarpu_mpi_shutdown()
																+    call fstarpu_shutdown()
																+    stop 1
																+  end if
																+  err   = fstarpu_mpi_barrier(comm_world)
																+
																+  ! Only the owning rank gives its variables an initial value.
																+  if(comm_w_rank.eq.0) then
																+    write(*,'(" ")')
																+    a1 = 1.0
																+    write(*,*) "init_a1", a1
																+    b1 = 0.5
																+    write(*,*) "init b1", b1
																+  end if
																+  if(comm_w_rank.eq.1) then
																+    write(*,'(" ")')
																+    a2 = 2.0
																+    write(*,*) "init_a2", a2
																+    b2 = 0.8
																+    write(*,*) "init b2", b2
																+  end if
																+
																+  ! allocate and fill the task codelet: buffer 0 is accessed in REDUX
																+  ! mode, buffer 1 is read-only
																+  task_cl = fstarpu_codelet_allocate()
																+  call fstarpu_codelet_set_name(task_cl, name)
																+  call fstarpu_codelet_add_cpu_func(task_cl, C_FUNLOC(cl_cpu_task))
																+  call fstarpu_codelet_add_buffer(task_cl, FSTARPU_REDUX)
																+  call fstarpu_codelet_add_buffer(task_cl, FSTARPU_R)
																+
																+  ! allocate and fill the reduction codelet: buffer 0 (RW) receives
																+  ! the contribution read from buffer 1 (R)
																+  task_red_cl = fstarpu_codelet_allocate()
																+  call fstarpu_codelet_set_name(task_red_cl, namered)
																+  call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
																+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW)
																+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
																+
																+  ! allocate and fill the initialization codelet (write-only buffer)
																+  task_ini_cl = fstarpu_codelet_allocate()
																+  call fstarpu_codelet_set_name(task_ini_cl, nameini)
																+  call fstarpu_codelet_add_cpu_func(task_ini_cl,C_FUNLOC(cl_cpu_task_ini))
																+  call fstarpu_codelet_add_buffer(task_ini_cl, FSTARPU_W)
																+
																+  err = fstarpu_mpi_barrier(comm_world)
																+
																+  ! a1/b1 are owned by rank 0: the owner registers its own memory,
																+  ! every other rank registers a handle without local storage.
																+  tag = 0
																+  if(comm_w_rank.eq.0) then
																+        call fstarpu_variable_data_register(a1hdl, 0, c_loc(a1),c_sizeof(a1))
																+        call fstarpu_variable_data_register(b1hdl, 0, c_loc(b1),c_sizeof(b1))
																+  else
																+        call fstarpu_variable_data_register(a1hdl, -1, c_null_ptr,c_sizeof(a1))
																+        call fstarpu_variable_data_register(b1hdl, -1, c_null_ptr,c_sizeof(b1))
																+  end if
																+  call fstarpu_mpi_data_register(a1hdl,tag,0)
																+  call fstarpu_mpi_data_register(b1hdl, tag+1,0)
																+
																+  ! a2/b2 are owned by rank 1, with the next two tags.
																+  tag = tag + 2
																+  if(comm_w_rank.eq.1) then
																+        call fstarpu_variable_data_register(a2hdl, 0, c_loc(a2),c_sizeof(a2))
																+        call fstarpu_variable_data_register(b2hdl, 0, c_loc(b2),c_sizeof(b2))
																+  else
																+        call fstarpu_variable_data_register(a2hdl, -1, c_null_ptr,c_sizeof(a2))
																+        call fstarpu_variable_data_register(b2hdl, -1, c_null_ptr,c_sizeof(b2))
																+  end if
																+  call fstarpu_mpi_data_register(a2hdl,tag,1)
																+  call fstarpu_mpi_data_register(b2hdl, tag+1, 1)
																+  tag = tag + 2
																+
																+  call fstarpu_data_set_reduction_methods(a1hdl, task_red_cl,task_ini_cl)
																+  call fstarpu_data_set_reduction_methods(a2hdl, task_red_cl,task_ini_cl)
																+
																+  err = fstarpu_mpi_barrier(comm_world)
																+
																+  call fstarpu_fxt_start_profiling()
																+
																+  ! Contribution to a1 is computed on rank 3.
																+  w_node = 3
																+  comm = comm_world
																+  call fstarpu_mpi_task_insert( (/ c_loc(comm),   &
																+             task_cl,                                         &
																+             FSTARPU_REDUX, a1hdl,                            &
																+             FSTARPU_R, b1hdl,                                &
																+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
																+             C_NULL_PTR /))
																+  ! Contribution to a2 is computed on rank 2.
																+  w_node = 2
																+  comm = comm_world
																+  call fstarpu_mpi_task_insert( (/ c_loc(comm),   &
																+             task_cl,                                         &
																+             FSTARPU_REDUX, a2hdl,                            &
																+             FSTARPU_R, b2hdl,                                &
																+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
																+             C_NULL_PTR /))
																+
																+  call fstarpu_mpi_redux_data(comm_world, a1hdl)
																+  call fstarpu_mpi_redux_data(comm_world, a2hdl)
																+  err = fstarpu_mpi_wait_for_all(comm_world)
																+
																+  if(comm_w_rank.eq.0) then
																+     write(*,*) 'computed result ---> ',a1, "expected =",4.5
																+  end if
																+  if(comm_w_rank.eq.1) then
																+     write(*,*) 'computed result ---> ',a2, "expected=",5.8
																+  end if
																+  call fstarpu_data_unregister(a1hdl)
																+  call fstarpu_data_unregister(a2hdl)
																+  call fstarpu_data_unregister(b1hdl)
																+  call fstarpu_data_unregister(b2hdl)
																+
																+  call fstarpu_fxt_stop_profiling()
																+  call fstarpu_codelet_free(task_cl)
																+  call fstarpu_codelet_free(task_red_cl)
																+  call fstarpu_codelet_free(task_ini_cl)
																+
																+
																+  err = fstarpu_mpi_shutdown()
																+  call fstarpu_shutdown()
																+
																+  stop
															
 
																+
															
 
																+contains
															
 
																+
															
 
																+  ! CPU implementation of the "task" codelet.
																+  !
																+  ! buffers: StarPU buffer descriptors; buffer 0 is the variable that is
																+  !          written, buffer 1 the variable that is read.
																+  ! cl_args: unused.
																+  !
																+  ! After a one second busy-wait, stores 3.0 + (buffer 1) into buffer 0
																+  ! and prints the previous and the new value with the MPI world rank.
																+  recursive subroutine cl_cpu_task (buffers, cl_args) bind(C)
																+    use iso_c_binding       ! C interfacing module
																+    use fstarpu_mod         ! StarPU interfacing module
																+    implicit none
																+
																+    type(c_ptr), value, intent(in) :: buffers, cl_args
																+    integer(c_int)            :: worker_id
																+    integer                   :: comm_rank
																+    type(c_ptr)               :: target_ptr, operand_ptr
																+    real(kind(1.d0)), pointer :: acc, operand
																+    real(kind(1.d0))          :: previous
																+
																+    worker_id = fstarpu_worker_get_id()
																+    comm_rank = fstarpu_mpi_world_rank()
																+
																+    ! Map both StarPU variables onto Fortran pointers.
																+    target_ptr  = fstarpu_variable_get_ptr(buffers, 0)
																+    operand_ptr = fstarpu_variable_get_ptr(buffers, 1)
																+    call c_f_pointer(target_ptr, acc)
																+    call c_f_pointer(operand_ptr, operand)
																+
																+    call nf_sleep(1.d0)
																+
																+    previous = acc
																+    acc = 3.0 + operand
																+    write(*,*) "task   (c_w_rank:",comm_rank,") from ",previous,"to",acc
																+  end subroutine cl_cpu_task
															
 
																+
															
 
																+  ! CPU implementation of the reduction codelet.
																+  !
																+  ! buffers: StarPU buffer descriptors; buffer 0 is the destination that
																+  !          is updated in place, buffer 1 the contribution added to it.
																+  ! cl_args: unused.
																+  !
																+  ! Adds buffer 1 into buffer 0, busy-waits one second, then prints the
																+  ! contribution, the previous destination value and the new one.
																+  recursive subroutine cl_cpu_task_red (buffers, cl_args) bind(C)
																+    use iso_c_binding       ! C interfacing module
																+    use fstarpu_mod         ! StarPU interfacing module
																+    implicit none
																+
																+    type(c_ptr), value, intent(in) :: buffers, cl_args
																+    integer                   :: comm_rank
																+    type(c_ptr)               :: dst_ptr, src_ptr
																+    real(kind(1.d0)), pointer :: dst, src
																+    real(kind(1.d0))          :: dst_before
																+
																+    comm_rank = fstarpu_mpi_world_rank()
																+
																+    dst_ptr = fstarpu_variable_get_ptr(buffers, 0)
																+    src_ptr = fstarpu_variable_get_ptr(buffers, 1)
																+    call c_f_pointer(dst_ptr, dst)
																+    call c_f_pointer(src_ptr, src)
																+
																+    ! Keep the old value around only for the trace below.
																+    dst_before = dst
																+    dst = dst_before + src
																+
																+    call nf_sleep(1.d0)
																+    write(*,*) "red_cl (c_w_rank:",comm_rank,")",src, dst_before, ' ---> ',dst
																+  end subroutine cl_cpu_task_red
															
 
																+
															
 
																+  ! CPU implementation of the initialization codelet.
																+  !
																+  ! buffers: StarPU buffer descriptors; buffer 0 is the variable to reset.
																+  ! cl_args: unused.
																+  !
																+  ! Busy-waits half a second, sets buffer 0 to zero and prints a trace
																+  ! line carrying the MPI world rank.
																+  recursive subroutine cl_cpu_task_ini (buffers, cl_args) bind(C)
																+    use iso_c_binding       ! C interfacing module
																+    use fstarpu_mod         ! StarPU interfacing module
																+    implicit none
																+
																+    type(c_ptr), value, intent(in) :: buffers, cl_args
																+    integer                   :: comm_rank
																+    type(c_ptr)               :: neutral_ptr
																+    real(kind(1.d0)), pointer :: neutral
																+
																+    comm_rank = fstarpu_mpi_world_rank()
																+
																+    neutral_ptr = fstarpu_variable_get_ptr(buffers, 0)
																+    call c_f_pointer(neutral_ptr, neutral)
																+
																+    call nf_sleep(0.5d0)
																+
																+    ! Zero is the neutral element of the sum done by the reduction codelet.
																+    neutral = 0.d0
																+    write(*,*) "ini_cl (c_w_rank:",comm_rank,")"
																+  end subroutine cl_cpu_task_ini
															
 
																+
															
 
																+  ! Busy-wait until more than t seconds of wall-clock time have elapsed.
																+  !
																+  ! t: minimum time to spend in the routine, in seconds.
																+  !
																+  ! The routine polls system_clock in a loop, so the calling thread
																+  ! stays busy for the whole duration.
																+  subroutine nf_sleep(t)
																+    implicit none
																+    real(kind(1.d0)), intent(in) :: t
																+    ! 64-bit counters: a default-kind counter wraps much sooner.
																+    integer(kind=8)  :: t_start, t_end, t_rate, t_max, ticks
																+    real(kind(1.d0)) :: ta
																+    call system_clock(t_start, t_rate, t_max)
																+    ! A rate of zero means no clock is available: return instead of
																+    ! dividing by zero and spinning forever.
																+    if (t_rate.le.0) return
																+    do
																+       call system_clock(t_end)
																+       ticks = t_end - t_start
																+       ! The counter restarts from 0 after t_max: undo the wrap so the
																+       ! elapsed time never goes negative.
																+       if (ticks.lt.0) ticks = (ticks + t_max) + 1
																+       ta = real(ticks, kind(1.d0))/real(t_rate, kind(1.d0))
																+       if(ta.gt.t) return
																+    end do
																+  end subroutine nf_sleep
															
 
																+
															
 
																+end program main
															
--- a/mpi/include/starpu_mpi.h
+++ b/mpi/include/starpu_mpi.h
@@ -232,6 +232,11 @@ int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, s
 
																 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
															
 
																 /**
															
 
																+   Same as starpu_mpi_irecv_detached() but with the \p prio parameter.
															
 
																+*/
															
 
																+int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);
															
 
																+
															
 
																+/**
															
 
																    Post a nonblocking receive in \p data_handle from the node \p
															
 
																    source using the message tag \p data_tag within the communicator \p
															
 
																    comm. On completion, the \p callback function is called with the
															
@@ -561,6 +566,10 @@ int starpu_mpi_data_get_rank(starpu_data_handle_t handle);
 
																    Return the tag of the given data.
															
 
																 */
															
 
																 starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t handle);
															
 
																+/**
															
 
																+   Return the redux map of the given data.
															
 
																+*/
															
 
																+char* starpu_mpi_data_get_redux_map(starpu_data_handle_t handle);
															
 
																 /**
															
 
																    Symbol kept for backward compatibility. Call function starpu_mpi_data_get_tag()
															
--- a/mpi/src/mpi/starpu_mpi_early_data.h
+++ b/mpi/src/mpi/starpu_mpi_early_data.h
@@ -40,7 +40,6 @@ LIST_TYPE(_starpu_mpi_early_data_handle,
 
																 	  void *buffer;
															
 
																 	  size_t size;
															
 
																 	  unsigned buffer_node;
															
 
																-	  int req_ready;
															
 
																 	  struct _starpu_mpi_node_tag node_tag;
															
 
																 	  starpu_pthread_mutex_t req_mutex;
															
 
																 	  starpu_pthread_cond_t req_cond;
															
--- a/mpi/src/mpi/starpu_mpi_mpi.c
+++ b/mpi/src/mpi/starpu_mpi_mpi.c
@@ -50,6 +50,9 @@ static unsigned nready_process;
 
																 /* Number of send requests to submit to MPI at the same time */
															
 
																 static unsigned ndetached_send;
															
 
																+/* Force allocation of early data */
															
 
																+static int early_data_force_allocate;
															
 
																+
															
 
																 #ifdef STARPU_USE_FXT
															
 
																 static void _starpu_mpi_add_sync_point_in_fxt(void);
															
 
																 #endif
															
@@ -81,6 +84,11 @@ static starpu_pthread_t progress_thread;
 
																 #endif
															
 
																 static int running = 0;
															
 
																+/* Provides synchronization between an early request, a sync request, and an early data handle:
															
 
																+ * we keep it held while checking and posting one to prevent the other.
															
 
																+ * This is to be taken always before the progress_mutex. */
															
 
																+static starpu_pthread_mutex_t early_data_mutex;
															
 
																+
															
 
																 /* Driver taken by StarPU-MPI to process tasks when there is no requests to
															
 
																  * handle instead of polling endlessly */
															
 
																 static struct starpu_driver *mpi_driver = NULL;
															
@@ -103,7 +111,7 @@ static int posted_requests = 0, ready_requests = 0, newer_requests, mpi_wait_for
 
																 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
															
 
																 #define _STARPU_MPI_INC_READY_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_ready_requests); ready_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_ready_requests); }
															
 
																-extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
															
 
																+extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count, int prio);
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 #pragma weak smpi_simulated_main_
															
@@ -182,8 +190,6 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 	_STARPU_MPI_DEBUG(0, "new req %p srcdst %d tag %"PRIi64" and type %s %d\n", req, req->node_tag.node.rank, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->backend->is_internal_req);
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																-
															
 
																 	if (req->request_type == RECV_REQ)
															
 
																 	{
															
 
																 		/* Case : the request is the internal receive request submitted
															
@@ -206,6 +212,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 				req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
															
 
																 			}
															
 
																+			STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
															
 
																 					  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr,
															
 
																 					  req->datatype_name, (int)req->count, req->registered_datatype);
															
@@ -213,31 +220,24 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 			_STARPU_MPI_INC_READY_REQUESTS(+1);
															
 
																 			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
															
 
																-			STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
															
 
																-			STARPU_PTHREAD_MUTEX_LOCK(&req->backend->posted_mutex);
															
 
																 			req->posted = 1;
															
 
																 			STARPU_PTHREAD_COND_BROADCAST(&req->backend->posted_cond);
															
 
																-			STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->posted_mutex);
															
 
																-			STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
 
																+			STARPU_PTHREAD_MUTEX_LOCK(&early_data_mutex);
															
 
																 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
															
 
																 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
															
 
																 			if (early_data_handle)
															
 
																 			{
															
 
																+				/* Got the early_data_handle */
															
 
																+				STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
															
 
																+
															
 
																 				/* Case: a receive request for a data with the given tag and source has already been
															
 
																 				 * posted to MPI by StarPU. Asynchronously requests a Read permission over the temporary handle ,
															
 
																 				 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
															
 
																 				 * will be called to bring the data back to the original data handle associated to the request.*/
															
 
																-				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
															
 
																-				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
															
 
																-				while (!(early_data_handle->req_ready))
															
 
																-					STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req_cond), &(early_data_handle->req_mutex));
															
 
																-				STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req_mutex));
															
 
																-				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																-
															
 
																 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %"PRIi64" has already been received, copying previously received data into handle's pointer..\n", req, req->node_tag.data_tag);
															
 
																 				STARPU_ASSERT(req->data_handle != early_data_handle->handle);
															
@@ -254,9 +254,8 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 				cb_args->req = req;
															
 
																 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
															
 
																-				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
															
 
																 				// FIXME: when buffer == NULL, do not hardcode acquiring on early_data_handle->buffer_node, to just acquire where the data happens to have been stored by MPI
															
 
																-				starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(early_data_handle->handle,early_data_handle->buffer_node,STARPU_R,NULL,_starpu_mpi_early_data_cb,(void*) cb_args,  1, 0, NULL, NULL);
															
 
																+				starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(early_data_handle->handle,early_data_handle->buffer_node,STARPU_R,NULL,_starpu_mpi_early_data_cb,(void*) cb_args,  1, 0, NULL, NULL, req->prio);
															
 
																 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 			}
															
 
																 			else
															
@@ -265,6 +264,8 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
															
 
																 				if (sync_req)
															
 
																 				{
															
 
																+					/* Got the sync req */
															
 
																+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
															
 
																 					/* Case: we already received the send envelope, we can proceed with the receive */
															
 
																 					req->sync = 1;
															
 
																 					_starpu_mpi_datatype_allocate(req->data_handle, req);
															
@@ -279,6 +280,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 						STARPU_ASSERT(req->count);
															
 
																 						req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
															
 
																 					}
															
 
																+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
															
 
																 					_STARPU_MPI_INC_READY_REQUESTS(+1);
															
 
																 					/* Throw away the dumb request that was only used to know that we got the envelope */
															
@@ -288,13 +290,17 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
																 				{
															
 
																 					/* Case: no matching data has been received. Store the receive request as an early_request. */
															
 
																 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
															
 
																+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 					_starpu_mpi_early_request_enqueue(req);
															
 
																+					/* We have queued our early request, we can let the progression thread look at it */
															
 
																+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
															
 
																 				}
															
 
																 			}
															
 
																 		}
															
 
																 	}
															
 
																 	else
															
 
																 	{
															
 
																+		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 		if (req->request_type == SEND_REQ)
															
 
																 			_starpu_mpi_req_prio_list_push_front(&ready_send_requests, req);
															
 
																 		else
															
@@ -1157,13 +1163,11 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 
																 	_starpu_mpi_early_data_add(early_data_handle);
															
 
																 	starpu_data_handle_t data_handle;
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
															
 
																 	data_handle = _starpu_mpi_tag_get_data_handle_from_tag(envelope->data_tag);
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 	// TODO: rather select some memory node next to the NIC
															
 
																 	unsigned buffer_node = STARPU_MAIN_RAM;
															
 
																-	if (data_handle && starpu_data_get_interface_id(data_handle) < STARPU_MAX_INTERFACE_ID)
															
 
																+	if (data_handle && starpu_data_get_interface_id(data_handle) < STARPU_MAX_INTERFACE_ID && !early_data_force_allocate)
															
 
																 	{
															
 
																 		/* We know which data will receive it and we won't have to unpack, use just the same kind of data.  */
															
 
																 		early_data_handle->buffer = NULL;
															
@@ -1190,25 +1194,16 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
															
 
																 	early_data_handle->req = _starpu_mpi_irecv_common(early_data_handle->handle, status.MPI_SOURCE,
															
 
																 							  early_data_handle->node_tag.data_tag, comm, 1, 0,
															
 
																-							  NULL, NULL, 1, 1, envelope->size);
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																+							  NULL, NULL, 1, 1, envelope->size, STARPU_DEFAULT_PRIO);
															
 
																+	/* The early data handle is ready, we can let _starpu_mpi_submit_ready_request
															
 
																+	 * proceed with acquiring it */
															
 
																+	STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
															
 
																+	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 	// We wait until the request is pushed in the
															
 
																 	// ready_request list
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->backend->posted_mutex));
															
 
																 	while (!(early_data_handle->req->posted))
															
 
																-		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->backend->posted_cond), &(early_data_handle->req->backend->posted_mutex));
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->backend->posted_mutex));
															
 
																-
															
 
																-#ifdef STARPU_DEVEL
															
 
																-#warning check if req_ready is still necessary
															
 
																-#endif
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&early_data_handle->req_mutex);
															
 
																-	early_data_handle->req_ready = 1;
															
 
																-	STARPU_PTHREAD_COND_BROADCAST(&early_data_handle->req_cond);
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																+		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->backend->posted_cond), &progress_mutex);
															
 
																 	// Handle the request immediatly to make sure the mpi_irecv is
															
 
																 	// posted before receiving an other envelope
															
@@ -1421,6 +1416,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
																 				{
															
 
																 					_STARPU_MPI_DEBUG(3, "Searching for application request with tag %"PRIi64" and source %d (size %ld)\n", envelope->data_tag, envelope_status.MPI_SOURCE, envelope->size);
															
 
																+					STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
															
 
																+					STARPU_PTHREAD_MUTEX_LOCK(&early_data_mutex);
															
 
																+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
															
 
																 					struct _starpu_mpi_req *early_request = _starpu_mpi_early_request_dequeue(envelope->data_tag, envelope_status.MPI_SOURCE, envelope_comm);
															
 
																 					/* Case: a data will arrive before a matching receive is
															
@@ -1453,9 +1451,12 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
																 							new_req->backend->is_internal_req = 0; // ????
															
 
																 							new_req->count = envelope->size;
															
 
																 							_starpu_mpi_sync_data_add(new_req);
															
 
																+							/* We have queued our sync request, we can let _starpu_mpi_submit_ready_request find it */
															
 
																+							STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
															
 
																 						}
															
 
																 						else
															
 
																 						{
															
 
																+							/* This will release early_data_mutex when appropriate */
															
 
																 							_starpu_mpi_receive_early_data(envelope, envelope_status, envelope_comm);
															
 
																 						}
															
 
																 					}
															
@@ -1466,6 +1467,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
																 					 * _starpu_mpi_handle_ready_request. */
															
 
																 					else
															
 
																 					{
															
 
																+						/* Got the early request */
															
 
																+						STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
															
 
																 						_STARPU_MPI_DEBUG(2000, "A matching application request has been found for the incoming data with tag %"PRIi64"\n", envelope->data_tag);
															
 
																 						_STARPU_MPI_DEBUG(2000, "Request sync %d\n", envelope->sync);
															
@@ -1621,6 +1624,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 
																 int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
															
 
																 {
															
 
																         STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
															
 
																+        STARPU_PTHREAD_MUTEX_INIT(&early_data_mutex, NULL);
															
 
																         STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
															
 
																         STARPU_PTHREAD_COND_INIT(&barrier_cond, NULL);
															
 
																 	_starpu_mpi_req_list_init(&ready_recv_requests);
															
@@ -1634,6 +1638,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
																 	nready_process = starpu_get_env_number_default("STARPU_MPI_NREADY_PROCESS", 10);
															
 
																 	ndetached_send = starpu_get_env_number_default("STARPU_MPI_NDETACHED_SEND", 10);
															
 
																+	early_data_force_allocate = starpu_get_env_number_default("STARPU_MPI_EARLYDATA_ALLOCATE", 0);
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 	STARPU_PTHREAD_MUTEX_INIT(&wait_counter_mutex, NULL);
															
@@ -1688,6 +1693,7 @@ void _starpu_mpi_progress_shutdown(void **value)
 
																         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_posted_requests);
															
 
																         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_ready_requests);
															
 
																         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
															
 
																+        STARPU_PTHREAD_MUTEX_DESTROY(&early_data_mutex);
															
 
																         STARPU_PTHREAD_COND_DESTROY(&barrier_cond);
															
 
																 }
															
--- a/mpi/src/mpi/starpu_mpi_mpi_backend.c
+++ b/mpi/src/mpi/starpu_mpi_mpi_backend.c
@@ -54,7 +54,6 @@ void _starpu_mpi_mpi_backend_request_init(struct _starpu_mpi_req *req)
 
																 	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->req_mutex, NULL);
															
 
																 	STARPU_PTHREAD_COND_INIT0(&req->backend->req_cond, NULL);
															
 
																-	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->posted_mutex, NULL);
															
 
																 	STARPU_PTHREAD_COND_INIT0(&req->backend->posted_cond, NULL);
															
 
																 	//req->backend->other_request = NULL;
															
@@ -80,7 +79,6 @@ void _starpu_mpi_mpi_backend_request_destroy(struct _starpu_mpi_req *req)
 
																 {
															
 
																 	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->req_mutex);
															
 
																 	STARPU_PTHREAD_COND_DESTROY(&req->backend->req_cond);
															
 
																-	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->posted_mutex);
															
 
																 	STARPU_PTHREAD_COND_DESTROY(&req->backend->posted_cond);
															
 
																 	free(req->backend);
															
 
																 	req->backend = NULL;
															
--- a/mpi/src/mpi/starpu_mpi_mpi_backend.h
+++ b/mpi/src/mpi/starpu_mpi_mpi_backend.h
@@ -54,7 +54,6 @@ struct _starpu_mpi_req_backend
 
																 	starpu_pthread_mutex_t req_mutex;
															
 
																 	starpu_pthread_cond_t req_cond;
															
 
																-	starpu_pthread_mutex_t posted_mutex;
															
 
																 	starpu_pthread_cond_t posted_cond;
															
 
																 	/** In the case of a Wait/Test request, we are going to post a request
															
 
																 	 * to test the completion of another request */
															
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -161,12 +161,12 @@ static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum sta
 
																 	if (sequential_consistency)
															
 
																 	{
															
 
																-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
															
 
																+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid, req->prio);
															
 
																 	}
															
 
																 	else
															
 
																 	{
															
 
																 		/* post_sync_job_id has already been filled */
															
 
																-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL);
															
 
																+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL, req->prio);
															
 
																 	}
															
 
																 }
															
@@ -289,7 +289,7 @@ int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, starp
 
																 	return starpu_mpi_issend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
															
 
																 }
															
 
																-struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count)
															
 
																+struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count, int prio)
															
 
																 {
															
 
																 	if (_starpu_mpi_fake_world_size != -1)
															
 
																 	{
															
@@ -297,7 +297,7 @@ struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handl
 
																 		return NULL;
															
 
																 	}
															
 
																-	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, 0, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
															
 
																+	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, prio, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
															
 
																 	_starpu_mpi_req_willpost(req);
															
 
																 	if (sequential_consistency == 0)
															
@@ -317,7 +317,7 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 
																 	struct _starpu_mpi_req *req;
															
 
																 	_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(source, data_tag);
															
 
																-	req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0);
															
 
																+	req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0, STARPU_DEFAULT_PRIO);
															
 
																 	_STARPU_MPI_TRACE_IRECV_COMPLETE_END(source, data_tag);
															
 
																 	STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
															
@@ -331,7 +331,17 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, star
 
																 {
															
 
																 	_STARPU_MPI_LOG_IN();
															
 
																-	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0);
															
 
																+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, STARPU_DEFAULT_PRIO);
															
 
																+	_STARPU_MPI_LOG_OUT();
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
															
 
																+{
															
 
																+	_STARPU_MPI_LOG_IN();
															
 
																+
															
 
																+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, prio);
															
 
																+
															
 
																 	_STARPU_MPI_LOG_OUT();
															
 
																 	return 0;
															
 
																 }
															
@@ -340,7 +350,7 @@ int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_h
 
																 {
															
 
																 	_STARPU_MPI_LOG_IN();
															
 
																-	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0);
															
 
																+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0, STARPU_DEFAULT_PRIO);
															
 
																 	_STARPU_MPI_LOG_OUT();
															
 
																 	return 0;
															
@@ -379,10 +389,13 @@ int starpu_mpi_barrier(MPI_Comm comm)
 
																 void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
															
 
																 {
															
 
																+	struct _starpu_mpi_data *data = data_handle->mpi_data;
															
 
																 	_mpi_backend._starpu_mpi_backend_data_clear(data_handle);
															
 
																 	_starpu_mpi_cache_data_clear(data_handle);
															
 
																-	_starpu_spin_destroy(&((struct _starpu_mpi_data*) data_handle->mpi_data)->coop_lock);
															
 
																-	free(data_handle->mpi_data);
															
 
																+	_starpu_spin_destroy(&data->coop_lock);
															
 
																+	if (data->redux_map != REDUX_CONTRIB)
															
 
																+		free(data->redux_map);
															
 
																+	free(data);
															
 
																 	data_handle->mpi_data = NULL;
															
 
																 }
															
@@ -448,6 +461,12 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
 
																 	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
															
 
																 }
															
 
																+char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data)
															
 
																+{
															
 
																+	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
															
 
																+	return ((struct _starpu_mpi_data *)(data->mpi_data))->redux_map;
															
 
																+}
															
 
																+
															
 
																 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
															
 
																 {
															
 
																 	int me, rank;
															
--- a/mpi/src/starpu_mpi_coop_sends.c
+++ b/mpi/src/starpu_mpi_coop_sends.c
@@ -297,8 +297,7 @@ void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_
 
																 	if (first)
															
 
																 		/* We were first, we are responsible for acquiring the data for everybody */
															
 
																-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, -1, mode, _starpu_mpi_coop_send_acquired_callback, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &coop_sends->pre_sync_jobid, NULL);
															
 
																+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, -1, mode, _starpu_mpi_coop_send_acquired_callback, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &coop_sends->pre_sync_jobid, NULL, req->prio);
															
 
																 	else
															
 
																 		req->pre_sync_jobid = coop_sends->pre_sync_jobid;
															
 
																 }
															
 
																-
															
--- a/mpi/src/starpu_mpi_private.h
+++ b/mpi/src/starpu_mpi_private.h
@@ -118,7 +118,7 @@ int _starpu_debug_rank;
 
																 			fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%ld:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
															
 
																 			fflush(stderr);	\
															
 
																 		} \
															
 
																-	} while(0);
															
 
																+	} while(0)
															
 
																 #  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
															
 
																 #  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm)  _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
															
 
																 #  define _STARPU_MPI_DEBUG(level, fmt, ...) \
															
@@ -130,7 +130,7 @@ int _starpu_debug_rank;
 
																 			fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__,## __VA_ARGS__); \
															
 
																 			fflush(stderr); \
															
 
																 		} \
															
 
																-	} while(0);
															
 
																+	} while(0)
															
 
																 #else
															
 
																 #  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way)  do { } while(0)
															
 
																 #  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm)     do { } while(0)
															
@@ -141,10 +141,10 @@ int _starpu_debug_rank;
 
																 #define _STARPU_MPI_DISP(fmt, ...) do { if (!_starpu_silent) { \
															
 
																 	       				     if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
															
 
																                                              fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
															
 
																-                                             fflush(stderr); }} while(0);
															
 
																+                                             fflush(stderr); }} while(0)
															
 
																 #define _STARPU_MPI_MSG(fmt, ...) do { if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
															
 
																                                              fprintf(stderr, "[%d][starpu_mpi][%s:%d] " fmt , _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
															
 
																-                                             fflush(stderr); } while(0);
															
 
																+                                             fflush(stderr); } while(0)
															
 
																 #ifdef STARPU_MPI_EXTRA_VERBOSE
															
 
																 #  define _STARPU_MPI_LOG_IN()             do { if (!_starpu_silent) { \
															
@@ -203,6 +203,12 @@ struct _starpu_mpi_coop_sends
 
																 	long pre_sync_jobid;
															
 
																 };
															
 
																+/** cf. redux_map field : this is the value
															
 
																+ * put in this field whenever a node contributes
															
 
																+ * to the reduction of the data.
															
 
																+ * Only the owning node keeps track of all the contributing nodes. */
															
 
																+#define REDUX_CONTRIB ((char*) -1)
															
 
																+
															
 
																 /** Initialized in starpu_mpi_data_register_comm */
															
 
																 struct _starpu_mpi_data
															
 
																 {
															
@@ -211,8 +217,12 @@ struct _starpu_mpi_data
 
																 	char *cache_sent;
															
 
																 	int cache_received;
															
 
																-	/** Rendez-vous data for opportunistic cooperative sends */
															
 
																-	/** Needed to synchronize between submit thread and workers */
															
 
																+	/** Array used to store the contributing nodes to this data
															
 
																+	  * when it is accessed in REDUX mode. */
															
 
																+	char* redux_map;
															
 
																+
															
 
																+	/** Rendez-vous data for opportunistic cooperative sends,
															
 
																+	  * Needed to synchronize between submit thread and workers */
															
 
																 	struct _starpu_spinlock coop_lock;
															
 
																 	/** Current cooperative send bag */
															
 
																 	struct _starpu_mpi_coop_sends *coop_sends;
															
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -100,7 +100,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 
																 	{
															
 
																 		STARPU_ASSERT_MSG(starpu_mpi_data_get_rank(data) == STARPU_MPI_PER_NODE, "If task is replicated, it has to access only per-node data");
															
 
																 	}
															
 
																-	if (data && mode & STARPU_R)
															
 
																+	if (data && mode & STARPU_R && !(mode & STARPU_MPI_REDUX))
															
 
																 	{
															
 
																 		int mpi_rank = starpu_mpi_data_get_rank(data);
															
 
																 		starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(data);
															
@@ -118,7 +118,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 
																 				if (data_tag == -1)
															
 
																 					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
															
 
																 				_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data, mpi_rank);
															
 
																-				starpu_mpi_irecv_detached(data, mpi_rank, data_tag, comm, NULL, NULL);
															
 
																+				starpu_mpi_irecv_detached_prio(data, mpi_rank, data_tag, prio, comm, NULL, NULL);
															
 
																 			}
															
 
																 			// else the node has already received the data
															
 
																 		}
															
@@ -142,7 +142,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 
																 static
															
 
																 void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm)
															
 
																 {
															
 
																-	if (mode & STARPU_W)
															
 
																+	if (mode & STARPU_W && !(mode & STARPU_MPI_REDUX))
															
 
																 	{
															
 
																 		int mpi_rank = starpu_mpi_data_get_rank(data);
															
 
																 		starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(data);
															
@@ -179,7 +179,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 
																 {
															
 
																 	if (_starpu_cache_enabled)
															
 
																 	{
															
 
																-		if (mode & STARPU_W || mode & STARPU_REDUX)
															
 
																+		if ((mode & STARPU_W && !(mode & STARPU_MPI_REDUX)) || mode & STARPU_REDUX)
															
 
																 		{
															
 
																 			/* The data has been modified, it MUST be removed from the cache */
															
 
																 			starpu_mpi_cached_send_clear(data);
															
@@ -189,7 +189,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 
																 	else
															
 
																 	{
															
 
																 		/* We allocated a temporary buffer for the received data, now drop it */
															
 
																-		if ((mode & STARPU_R) && do_execute)
															
 
																+		if ((mode & STARPU_R && !(mode & STARPU_MPI_REDUX)) && do_execute)
															
 
																 		{
															
 
																 			int mpi_rank = starpu_mpi_data_get_rank(data);
															
 
																 			if (mpi_rank == STARPU_MPI_PER_NODE)
															
@@ -254,7 +254,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 
																 				inconsistent_execute = 0;
															
 
																 			}
															
 
																 		}
															
 
																-		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
															
 
																+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX || arg_type & STARPU_MPI_REDUX)
															
 
																 		{
															
 
																 			starpu_data_handle_t data = va_arg(varg_list_copy, starpu_data_handle_t);
															
 
																 			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
															
@@ -617,6 +617,20 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 
																 	for(i=0 ; i<nb_data ; i++)
															
 
																 	{
															
 
																+		if ((descrs[i].mode & STARPU_REDUX || descrs[i].mode & STARPU_MPI_REDUX) && descrs[i].handle)
															
 
																+		{
															
 
																+			struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data;
															
 
																+			if (me == starpu_mpi_data_get_rank(descrs[i].handle))
															
 
																+			{
															
 
																+				int size;
															
 
																+				starpu_mpi_comm_size(comm, &size);
															
 
																+				if (mpi_data->redux_map == NULL)
															
 
																+					_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
															
 
																+				mpi_data->redux_map [xrank] = 1;
															
 
																+			}
															
 
																+			else if (me == xrank)
															
 
																+				mpi_data->redux_map = REDUX_CONTRIB;
															
 
																+		}
															
 
																 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
															
 
																 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
															
 
																 	}
															
@@ -813,6 +827,11 @@ void _starpu_mpi_redux_fill_post_sync_jobid(const void * const redux_data_args,
 
																 /* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
															
 
																  * a data previously accessed in REDUX mode gets accessed in R mode. */
															
 
																+/* FIXME: In order to prevent simultaneous receive submissions
															
 
																+ * on the same handle, we need to wait that all the starpu_mpi
															
 
																+ * tasks are done before submitting next tasks. The current
															
 
																+ * version of the implementation does not support multiple
															
 
																+ * simultaneous receive requests on the same handle.*/
															
 
																 void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
															
 
																 {
															
 
																 	int me, rank, nb_nodes;
															
@@ -820,6 +839,7 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 
																 	rank = starpu_mpi_data_get_rank(data_handle);
															
 
																 	data_tag = starpu_mpi_data_get_tag(data_handle);
															
 
																+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
															
 
																 	if (rank == -1)
															
 
																 	{
															
 
																 		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
															
@@ -832,12 +852,16 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 
																 	starpu_mpi_comm_rank(comm, &me);
															
 
																 	starpu_mpi_comm_size(comm, &nb_nodes);
															
 
																-	_STARPU_MPI_DEBUG(1, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
															
 
																-
															
 
																+	_STARPU_MPI_DEBUG(50, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
															
 
																 	// need to count how many nodes have the data in redux mode
															
 
																 	if (me == rank)
															
 
																 	{
															
 
																-		int i;
															
 
																+		int i,j;
															
 
																+		_STARPU_MPI_DEBUG(50, "Who is in the map ?\n");
															
 
																+		for (j = 0; j<nb_nodes; j++)
															
 
																+		{
															
 
																+			_STARPU_MPI_DEBUG(50, "%d is in the map ? %d\n", j, mpi_data->redux_map[j]);
															
 
																+		}
															
 
																 		// taskC depends on all taskBs created
															
 
																 		// Creating synchronization task and use its jobid for tracing
															
@@ -848,8 +872,9 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 
																 		for(i=0 ; i<nb_nodes ; i++)
															
 
																 		{
															
 
																-			if (i != rank)
															
 
																+			if (i != rank && mpi_data->redux_map[i])
															
 
																 			{
															
 
																+				_STARPU_MPI_DEBUG(5, "%d takes part in the reduction of %p \n", i, data_handle);
															
 
																 				/* We need to make sure all is
															
 
																 				 * executed after data_handle finished
															
 
																 				 * its last read access, we hence do
															
@@ -893,24 +918,34 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 
																 						   STARPU_CALLBACK_WITH_ARG_NFREE, _starpu_mpi_redux_data_recv_callback, args,
															
 
																 						   0);
															
 
																 			}
															
 
																+			else
															
 
																+			{
															
 
																+				_STARPU_MPI_DEBUG(5, "%d is not in the map or is me\n", i);
															
 
																+			}
															
 
																 		}
															
 
																 		int ret = starpu_task_submit(taskC);
															
 
																 		STARPU_ASSERT(ret == 0);
															
 
																 	}
															
 
																-	else
															
 
																+	else if (mpi_data->redux_map)
															
 
																 	{
															
 
																-		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
															
 
																+		STARPU_ASSERT(mpi_data->redux_map == REDUX_CONTRIB);
															
 
																+		_STARPU_MPI_DEBUG(5, "Sending redux handle to %d ...\n", rank);
															
 
																 		starpu_mpi_isend_detached_prio(data_handle, rank, data_tag, prio, comm, NULL, NULL);
															
 
																-		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
															
 
																+		starpu_data_invalidate_submit(data_handle);
															
 
																 	}
															
 
																-	/* FIXME: In order to prevent simultaneous receive submissions
															
 
																-	 * on the same handle, we need to wait that all the starpu_mpi
															
 
																-	 * tasks are done before submitting next tasks. The current
															
 
																-	 * version of the implementation does not support multiple
															
 
																-	 * simultaneous receive requests on the same handle.*/
															
 
																-	starpu_task_wait_for_all();
															
 
																-
															
 
																+	else
															
 
																+	{
															
 
																+		_STARPU_MPI_DEBUG(5, "I am not in the map of %d, I am %d ...\n", rank, me);
															
 
																+	}
															
 
																+	if (mpi_data->redux_map != NULL)
															
 
																+	{
															
 
																+		_STARPU_MPI_DEBUG(100, "waiting for redux tasks with %d\n", rank);
															
 
																+		starpu_task_wait_for_all();
															
 
																+	}
															
 
																+	if (me == rank)
															
 
																+		free(mpi_data->redux_map);
															
 
																+	mpi_data->redux_map = NULL;
															
 
																 }
															
 
																 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
															
 
																 {
															
--- a/mpi/src/starpu_mpi_task_insert_fortran.c
+++ b/mpi/src/starpu_mpi_task_insert_fortran.c
@@ -74,7 +74,7 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 
																 				inconsistent_execute = 0;
															
 
																 			}
															
 
																 		}
															
 
																-		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
															
 
																+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX || arg_type & STARPU_MPI_REDUX)
															
 
																 		{
															
 
																 			arg_i++;
															
 
																 			starpu_data_handle_t data = arglist[arg_i];
															
--- a/mpi/tests/mpi_reduction.c
+++ b/mpi/tests/mpi_reduction.c
@@ -37,7 +37,7 @@ static struct starpu_codelet init_codelet =
 
																 static struct starpu_codelet redux_codelet =
															
 
																 {
															
 
																 	.cpu_funcs = {redux_cpu_func},
															
 
																-	.modes = {STARPU_RW, STARPU_R},
															
 
																+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
															
 
																 	.nbuffers = 2,
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 	.model = &starpu_perfmodel_nop,
															
--- a/mpi/tests/mpi_redux.c
+++ b/mpi/tests/mpi_redux.c
@@ -14,6 +14,9 @@
 
																  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																  */
															
 
																+/* This test does a manual reduction: all ranks send a number to the rank 0,
															
 
																+ * the rank 0 sums these numbers and sends back the result to all ranks. */
															
 
																+
															
 
																 #include <starpu_mpi.h>
															
 
																 #include "helper.h"
															
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -342,7 +342,7 @@ do {									\
 
																 	snprintf((char *)futargs, len, "%s", str);			\
															
 
																 	((char *)futargs)[len - 1] = '\0';				\
															
 
																 	_STARPU_FUT_COMMIT(total_len);					\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_FULL_PROBE1STR
															
@@ -356,7 +356,7 @@ do {									\
 
																     if(KEYMASK & fut_active) {						\
															
 
																 	_STARPU_FUT_ALWAYS_PROBE1STR(CODE, P1, str);		\
															
 
																     }									\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_ALWAYS_PROBE2STR
															
@@ -377,7 +377,7 @@ do {									\
 
																 	snprintf((char *)futargs, len, "%s", str);			\
															
 
																 	((char *)futargs)[len - 1] = '\0';				\
															
 
																 	_STARPU_FUT_COMMIT(total_len);					\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_FULL_PROBE2STR
															
@@ -388,7 +388,7 @@ do {									\
 
																     if(KEYMASK & fut_active) {						\
															
 
																 	_STARPU_FUT_ALWAYS_PROBE2STR(CODE, P1, P2, str);		\
															
 
																     }									\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_ALWAYS_PROBE3STR
															
@@ -410,7 +410,7 @@ do {									\
 
																 	snprintf((char *)futargs, len, "%s", str);			\
															
 
																 	((char *)futargs)[len - 1] = '\0';				\
															
 
																 	_STARPU_FUT_COMMIT(total_len);					\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_FULL_PROBE3STR
															
@@ -421,7 +421,7 @@ do {									\
 
																     if(KEYMASK & fut_active) {						\
															
 
																 	_STARPU_FUT_ALWAYS_PROBE3STR(CODE, P1, P2, P3, str);	\
															
 
																     }									\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_ALWAYS_PROBE4STR
															
@@ -444,7 +444,7 @@ do {									\
 
																 	snprintf((char *)futargs, len, "%s", str);			\
															
 
																 	((char *)futargs)[len - 1] = '\0';				\
															
 
																 	_STARPU_FUT_COMMIT(total_len);					\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_FULL_PROBE4STR
															
@@ -455,7 +455,7 @@ do {									\
 
																     if(KEYMASK & fut_active) {						\
															
 
																 	_STARPU_FUT_ALWAYS_PROBE4STR(CODE, P1, P2, P3, P4, str);	\
															
 
																     }									\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_ALWAYS_PROBE5STR
															
@@ -479,7 +479,7 @@ do {									\
 
																 	snprintf((char *)futargs, len, "%s", str);			\
															
 
																 	((char *)futargs)[len - 1] = '\0';				\
															
 
																 	_STARPU_FUT_COMMIT(total_len);					\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_FULL_PROBE5STR
															
@@ -490,7 +490,7 @@ do {									\
 
																     if(KEYMASK & fut_active) {						\
															
 
																 	_STARPU_FUT_ALWAYS_PROBE5STR(CODE, P1, P2, P3, P4, P5, str);	\
															
 
																     }									\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_ALWAYS_PROBE6STR
															
@@ -515,7 +515,7 @@ do {									\
 
																 	snprintf((char *)futargs, len, "%s", str);			\
															
 
																 	((char *)futargs)[len - 1] = '\0';				\
															
 
																 	_STARPU_FUT_COMMIT(total_len);					\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_FULL_PROBE6STR
															
@@ -526,7 +526,7 @@ do {									\
 
																     if(KEYMASK & fut_active) {						\
															
 
																 	_STARPU_FUT_ALWAYS_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str);	\
															
 
																     }									\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_ALWAYS_PROBE7STR
															
@@ -552,7 +552,7 @@ do {									\
 
																 	snprintf((char *)futargs, len, "%s", str);			\
															
 
																 	((char *)futargs)[len - 1] = '\0';				\
															
 
																 	_STARPU_FUT_COMMIT(total_len);					\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifdef FUT_FULL_PROBE7STR
															
@@ -563,7 +563,7 @@ do {									\
 
																     if(KEYMASK & fut_active) {						\
															
 
																 	_STARPU_FUT_ALWAYS_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str);	\
															
 
																     }									\
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #endif
															
 
																 #ifndef FUT_RAW_PROBE7
															
@@ -787,7 +787,7 @@ do {									\
 
																 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
															
 
																 		FUT_FULL_PROBE7(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
															
 
																 	}								\
															
 
																-} while(0);
															
 
																+} while(0)
															
 
																 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)			\
															
 
																 do {									\
															
@@ -796,7 +796,7 @@ do {									\
 
																 	char _archname[32]=""; \
															
 
																 	starpu_perfmodel_get_arch_name(perf_arch, _archname, 32, 0);	\
															
 
																 	_STARPU_FUT_FULL_PROBE5STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_END_CODELET_BODY, (job)->job_id, (job_size), (job_hash), workerid, _starpu_gettid(), _archname); \
															
 
																-} while(0);
															
 
																+} while(0)
															
 
																 #define _STARPU_TRACE_START_EXECUTING()				\
															
 
																 	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_START_EXECUTING, _starpu_gettid());
															
@@ -898,7 +898,7 @@ do {										\
 
																 	else {									\
															
 
																 		FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 0);\
															
 
																 	}									\
															
 
																-} while(0);
															
 
																+} while(0)
															
 
																 #define _STARPU_TRACE_DATA_NAME(handle, name) \
															
 
																 	_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_META, _STARPU_FUT_DATA_NAME, handle, name)
															
@@ -1319,8 +1319,8 @@ do {										\
 
																 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)          \
															
 
																        FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_STATE_SHARED, handle, node)
															
 
																-#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre)          \
															
 
																-       FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_REQUEST_CREATED, orig, dest, prio, handle, is_pre)
															
 
																+#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre, req)          \
															
 
																+       FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_REQUEST_CREATED, orig, dest, prio, handle, is_pre, req)
															
 
																 #else // !STARPU_USE_FXT
															
@@ -1451,7 +1451,7 @@ do {										\
 
																 #define _STARPU_TRACE_DATA_STATE_INVALID(handle, node)	do {(void)(handle); (void)(node);} while(0)
															
 
																 #define _STARPU_TRACE_DATA_STATE_OWNER(handle, node)	do {(void)(handle); (void)(node);} while(0)
															
 
																 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)	do {(void)(handle); (void)(node);} while(0)
															
 
																-#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre) do {(void)(handle); (void)(orig); (void)(dest); (void)(prio); (void)(is_pre);} while(0)
															
 
																+#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre, req) do {(void)(handle); (void)(orig); (void)(dest); (void)(prio); (void)(is_pre); (void)(req); } while(0)
															
 
																 #define _STARPU_TRACE_PAPI_TASK_EVENT(event_id, task, value) do {(void)(event_id); (void)(task); (void)(value);} while(0)
															
 
																 #endif // STARPU_USE_FXT
															
--- a/src/common/hash.c
+++ b/src/common/hash.c
@@ -46,6 +46,11 @@ uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc)
 
																 	return crc;
															
 
																 }
															
 
																+uint32_t starpu_hash_crc32c_be_ptr(void *input, uint32_t inputcrc)
															
 
																+{
															
 
																+	return starpu_hash_crc32c_be_n(&input, sizeof(input), inputcrc);
															
 
																+}
															
 
																+
															
 
																 uint32_t starpu_hash_crc32c_be(uint32_t input, uint32_t inputcrc)
															
 
																 {
															
 
																 	uint8_t *p = (uint8_t *)&input;
															
--- a/src/common/uthash.h
+++ b/src/common/uthash.h
@@ -104,12 +104,12 @@ do {
 
																   if (!((tbl)->bloom_bv))  { uthash_fatal( "out of memory"); }                   \
															
 
																   memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN);                                \
															
 
																   (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE;                                       \
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #define HASH_BLOOM_FREE(tbl)                                                     \
															
 
																 do {                                                                             \
															
 
																   uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN);                              \
															
 
																-} while (0);
															
 
																+} while (0)
															
 
																 #define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8)))
															
 
																 #define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8)))
															
@@ -368,7 +368,7 @@ do {
 
																   for(_fn_i=0; _fn_i < keylen; _fn_i++)                                          \
															
 
																       hashv = (hashv * 16777619) ^ _hf_key[_fn_i];                               \
															
 
																   bkt = hashv & (num_bkts-1);                                                    \
															
 
																-} while(0);
															
 
																+} while(0)
															
 
																 #define HASH_OAT(key,keylen,num_bkts,hashv,bkt)                                  \
															
 
																 do {                                                                             \
															
@@ -507,7 +507,7 @@ do {
 
																     hashv ^= hashv << 25;                                                        \
															
 
																     hashv += hashv >> 6;                                                         \
															
 
																     bkt = hashv & (num_bkts-1);                                                  \
															
 
																-} while(0);
															
 
																+} while(0)
															
 
																 #ifdef HASH_USING_NO_STRICT_ALIASING
															
 
																 /* The MurmurHash exploits some CPU's (e.g. x86) tolerance for unaligned reads.
															
--- a/src/core/dependencies/data_arbiter_concurrency.c
+++ b/src/core/dependencies/data_arbiter_concurrency.c
@@ -286,7 +286,7 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 
																 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
															
 
																 		{
															
 
																 			cpt++;
															
 
																-			_starpu_datawizard_progress(0);
															
 
																+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
															
 
																 		}
															
 
																 		if (cpt == STARPU_SPIN_MAXTRY)
															
 
																 			_starpu_spin_lock(&handle->header_lock);
															
--- a/src/core/dependencies/data_concurrency.c
+++ b/src/core/dependencies/data_concurrency.c
@@ -132,7 +132,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 
																 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
															
 
																 		{
															
 
																 			cpt++;
															
 
																-			_starpu_datawizard_progress(0);
															
 
																+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
															
 
																 		}
															
 
																 		if (cpt == STARPU_SPIN_MAXTRY)
															
 
																 			_starpu_spin_lock(&handle->header_lock);
															
@@ -266,7 +266,7 @@ static void _starpu_take_data(unsigned request_from_codelet,
 
																 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
															
 
																 		{
															
 
																 			cpt++;
															
 
																-			_starpu_datawizard_progress(0);
															
 
																+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
															
 
																 		}
															
 
																 		if (cpt == STARPU_SPIN_MAXTRY)
															
 
																 			_starpu_spin_lock(&handle->header_lock);
															
--- a/src/core/dependencies/implicit_data_deps.c
+++ b/src/core/dependencies/implicit_data_deps.c
@@ -225,8 +225,12 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
																 		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
															
 
																 		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
															
 
																-		if (mode & STARPU_R)
															
 
																-			STARPU_ASSERT_MSG(handle->initialized || handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
															
 
																+		if (mode & STARPU_R && !handle->initialized)
															
 
																+		{
															
 
																+			STARPU_ASSERT_MSG(handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
															
 
																+			/* The task will initialize it with init_cl */
															
 
																+			handle->initialized = 1;
															
 
																+		}
															
 
																 		if (mode & STARPU_W || mode == STARPU_REDUX)
															
 
																 		{
															
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -288,8 +288,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
																 	{
															
 
																 		unsigned long jobs = STARPU_ATOMIC_ADDL(&njobs_finished, 1);
															
 
																-		printf("\r%lu tasks finished...", jobs);
															
 
																-		fflush(stdout);
															
 
																+		fprintf(stderr,"\r%lu tasks finished (last %lu %p)...", jobs, j->job_id, j->task);
															
 
																 	}
															
 
																 	struct starpu_task *task = j->task;
															
--- a/src/core/perfmodel/energy_model.c
+++ b/src/core/perfmodel/energy_model.c
@@ -43,7 +43,7 @@
 
																 #endif
															
 
																 #endif
															
 
																-#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  return(retval); } while (0)
															
 
																+#define ERROR_RETURN(retval, function) do { PAPI_perror(function); fprintf(stderr, "Error %d %s:line %d\n", retval,__FILE__,__LINE__);  return(retval); } while (0)
															
 
																 #if 0
															
 
																 #define debug(fmt, ...) printf(fmt, ## __VA_ARGS__)
															
@@ -52,6 +52,7 @@
 
																 #endif
															
 
																 #ifdef STARPU_PAPI
															
 
																+#ifdef STARPU_HAVE_HWLOC
															
 
																 static const int N_EVTS = 2;
															
 
																 static int nsockets;
															
@@ -68,7 +69,7 @@ static int add_event(int EventSet, int socket);
 
																 /*must be initialized to PAPI_NULL before calling PAPI_create_event*/
															
 
																 static int EventSet = PAPI_NULL;
															
 
																-
															
 
																+#endif
															
 
																 #endif
															
 
																 static double t1;
															
@@ -80,7 +81,7 @@ static nvmlDevice_t device;
 
																 #endif
															
 
																 #endif
															
 
																-int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
															
 
																+int starpu_energy_start(int workerid STARPU_ATTRIBUTE_UNUSED, enum starpu_worker_archtype archi)
															
 
																 {
															
 
																 	t1 = starpu_timing_now();
															
@@ -100,11 +101,11 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
 
																 		nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
															
 
																 		if ((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
															
 
																-			ERROR_RETURN(retval);
															
 
																+			ERROR_RETURN(retval, "PAPI_library_init");
															
 
																 		/* Creating the eventset */
															
 
																 		if ((retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
															
 
																-			ERROR_RETURN(retval);
															
 
																+			ERROR_RETURN(retval, "PAPI_create_eventset");
															
 
																 		int i;
															
 
																 		for (i = 0 ; i < nsockets ; i ++ )
															
@@ -112,19 +113,25 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
 
																 			/* return the index of socket */
															
 
																 			hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PACKAGE, i);
															
 
																 			if ( (retval = add_event(EventSet, obj->os_index)) != PAPI_OK)
															
 
																-				ERROR_RETURN(retval);
															
 
																+			{
															
 
																+				if (retval == PAPI_EPERM)
															
 
																+					_STARPU_DISP("PAPI could not access counters due to permissions errors. Perhaps your system requires to run measurements as root?\n");
															
 
																+				else if (retval == PAPI_ENOEVNT)
															
 
																+					_STARPU_DISP("PAPI could not access counters. Perhaps your system requires to run measurements as root?\n");
															
 
																+				ERROR_RETURN(retval, "PAPI_add_named_event");
															
 
																+			}
															
 
																 		}
															
 
																 		/* get the number of events in the event set */
															
 
																 		number = 0;
															
 
																 		if ( (retval = PAPI_list_events(EventSet, NULL, &number)) != PAPI_OK)
															
 
																-			ERROR_RETURN(retval);
															
 
																+			ERROR_RETURN(retval, "PAPI_list_events");
															
 
																 		debug("There are %d events in the event set\n", number);
															
 
																 		/* Start counting */
															
 
																 		if ( (retval = PAPI_start(EventSet)) != PAPI_OK)
															
 
																-			ERROR_RETURN(retval);
															
 
																+			ERROR_RETURN(retval, "PAPI_start");
															
 
																 		return retval;
															
 
																 	}
															
@@ -180,7 +187,7 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
																 		/* Stop counting and store the values into the array */
															
 
																 		if ( (retval = PAPI_stop(EventSet, values)) != PAPI_OK)
															
 
																-			ERROR_RETURN(retval);
															
 
																+			ERROR_RETURN(retval, "PAPI_stop");
															
 
																 		int k,s;
															
@@ -199,11 +206,11 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
																 		/*removes all events from a PAPI event set */
															
 
																 		if ( (retval = PAPI_cleanup_eventset(EventSet)) != PAPI_OK)
															
 
																-			ERROR_RETURN(retval);
															
 
																+			ERROR_RETURN(retval, "PAPI_cleanup_eventset");
															
 
																 		/*deallocates the memory associated with an empty PAPI EventSet*/
															
 
																 		if ( (retval = PAPI_destroy_eventset(&EventSet)) != PAPI_OK)
															
 
																-			ERROR_RETURN(retval);
															
 
																+			ERROR_RETURN(retval, "PAPI_destroy_eventset");
															
 
																 		break;
															
 
																 	}
															
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -1328,7 +1328,7 @@ static void write_bus_latency_file_content(void)
 
																 	_STARPU_DEBUG("writing latencies to %s\n", path);
															
 
																-	f = fopen(path, "w+");
															
 
																+	f = fopen(path, "a+");
															
 
																 	if (!f)
															
 
																 	{
															
 
																 		perror("fopen write_bus_latency_file_content");
															
@@ -1337,6 +1337,7 @@ static void write_bus_latency_file_content(void)
 
																 		STARPU_ABORT();
															
 
																 	}
															
 
																 	locked = _starpu_fwrlock(f) == 0;
															
 
																+	fseek(f, 0, SEEK_SET);
															
 
																 	_starpu_fftruncate(f, 0);
															
 
																 	fprintf(f, "# ");
															
@@ -1684,10 +1685,11 @@ static void write_bus_bandwidth_file_content(void)
 
																 	_STARPU_DEBUG("writing bandwidth to %s\n", path);
															
 
																-	f = fopen(path, "w+");
															
 
																+	f = fopen(path, "a+");
															
 
																 	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
															
 
																 	locked = _starpu_fwrlock(f) == 0;
															
 
																+	fseek(f, 0, SEEK_SET);
															
 
																 	_starpu_fftruncate(f, 0);
															
 
																 	fprintf(f, "# ");
															
@@ -2124,9 +2126,10 @@ static void write_bus_config_file_content(void)
 
																 	_STARPU_DEBUG("writing config to %s\n", path);
															
 
																-	f = fopen(path, "w+");
															
 
																+	f = fopen(path, "a+");
															
 
																 	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
															
 
																 	locked = _starpu_fwrlock(f) == 0;
															
 
																+	fseek(f, 0, SEEK_SET);
															
 
																 	_starpu_fftruncate(f, 0);
															
 
																 	fprintf(f, "# Current configuration\n");
															
@@ -2655,7 +2658,7 @@ static void write_bus_platform_file_content(int version)
 
																 	_STARPU_DEBUG("writing platform to %s\n", path);
															
 
																-	f = fopen(path, "w+");
															
 
																+	f = fopen(path, "a+");
															
 
																 	if (!f)
															
 
																 	{
															
 
																 		perror("fopen write_bus_platform_file_content");
															
@@ -2664,6 +2667,7 @@ static void write_bus_platform_file_content(int version)
 
																 		STARPU_ABORT();
															
 
																 	}
															
 
																 	locked = _starpu_fwrlock(f) == 0;
															
 
																+	fseek(f, 0, SEEK_SET);
															
 
																 	_starpu_fftruncate(f, 0);
															
 
																 	fprintf(f,
															
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -1177,11 +1177,12 @@ void starpu_save_history_based_model(struct starpu_perfmodel *model)
 
																 	/* overwrite existing file, or create it */
															
 
																 	FILE *f;
															
 
																-	f = fopen(path, "w+");
															
 
																+	f = fopen(path, "a+");
															
 
																 	STARPU_ASSERT_MSG(f, "Could not save performance model %s\n", path);
															
 
																 	locked = _starpu_fwrlock(f) == 0;
															
 
																 	check_model(model);
															
 
																+	fseek(f, 0, SEEK_SET);
															
 
																 	_starpu_fftruncate(f, 0);
															
 
																 	dump_model_file(f, model);
															
 
																 	if (locked)
															
@@ -1610,10 +1611,10 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 
																 	}
															
 
																 	regmodel = &model->state->per_arch[comb][nimpl].regression;
															
 
																-	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
															
 
																 	if (regmodel->valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
															
 
																                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
															
 
																+	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
															
 
																 docal:
															
 
																 	STARPU_HG_DISABLE_CHECKING(model->benchmarking);
															
@@ -1654,8 +1655,8 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
																 	if (regmodel->nl_valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
															
 
																 	{
															
 
																-		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
															
 
																 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
															
 
																+		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
															
 
																 	}
															
 
																 	else
															
 
																 	{
															
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -206,7 +206,7 @@ struct starpu_sched_policy *_starpu_select_sched_policy(struct _starpu_machine_c
 
																 	if (selected_policy)
															
 
																 		return selected_policy;
															
 
																-	/* If no policy was specified, we use the eager policy by default */
															
 
																+	/* If no policy was specified, we use the lws policy by default */
															
 
																 	return &_starpu_sched_lws_policy;
															
 
																 }
															
@@ -1153,25 +1153,6 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 
																 	}
															
 
																 }
															
 
																-void _starpu_wait_on_sched_event(void)
															
 
																-{
															
 
																-	struct _starpu_worker *worker = _starpu_get_local_worker_key();
															
 
																-
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
															
 
																-
															
 
																-	_starpu_handle_all_pending_node_data_requests(worker->memory_node);
															
 
																-
															
 
																-	if (_starpu_machine_is_running())
															
 
																-	{
															
 
																-#ifndef STARPU_NON_BLOCKING_DRIVERS
															
 
																-		STARPU_PTHREAD_COND_WAIT(&worker->sched_cond,
															
 
																-					  &worker->sched_mutex);
															
 
																-#endif
															
 
																-	}
															
 
																-
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
															
 
																-}
															
 
																-
															
 
																 int starpu_push_local_task(int workerid, struct starpu_task *task, int back STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
															
--- a/src/core/sched_policy.h
+++ b/src/core/sched_policy.h
@@ -63,8 +63,6 @@ struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx);
 
																 void _starpu_sched_post_exec_hook(struct starpu_task *task);
															
 
																 int _starpu_pop_task_end(struct starpu_task *task);
															
 
																-void _starpu_wait_on_sched_event(void);
															
 
																-
															
 
																 struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
															
 
																 						   unsigned int node) STARPU_ATTRIBUTE_MALLOC;
															
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -1168,6 +1168,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 
																 	/* Do not start performance counter collection by default */
															
 
																 	conf->start_perf_counter_collection = 0;
															
 
																+
															
 
																+	conf->cuda_only_fast_alloc_other_memnodes = starpu_get_env_number_default("STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES", 0);
															
 
																 	return 0;
															
 
																 }
															
@@ -1531,6 +1533,14 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
																 		_STARPU_DISP("Warning: STARPU_ENABLE_STATS is enabled, which slows down a bit\n");
															
 
																 	}
															
 
																+#ifndef STARPU_SIMGRID
															
 
																+	if (starpu_get_env_number_default("STARPU_SIMGRID", 0))
															
 
																+	{
															
 
																+		_STARPU_DISP("Simulation mode requested, but this libstarpu was built without simgrid support, please recompile\n");
															
 
																+		return -EINVAL;
															
 
																+	}
															
 
																+#endif
															
 
																+
															
 
																 #if defined(_WIN32) && !defined(__CYGWIN__)
															
 
																 	WSADATA wsadata;
															
 
																 	WSAStartup(MAKEWORD(1,0), &wsadata);
															
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -179,7 +179,6 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 
																 	/* the data is present now */
															
 
																 	unsigned requesting_node = requesting_replicate->memory_node;
															
 
																-	requesting_replicate->requested &= ~(1UL << requesting_node);
															
 
																 	if (mode & STARPU_W)
															
 
																 	{
															
@@ -406,16 +405,18 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 
																 /* handle->lock should be taken. r is returned locked. The node parameter
															
 
																  * indicate either the source of the request, or the destination for a
															
 
																  * write-only request. */
															
 
																-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch)
															
 
																+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, struct starpu_task *task, enum starpu_is_prefetch is_prefetch)
															
 
																 {
															
 
																 	struct _starpu_data_request *r;
															
 
																-	r = replicate->request[node];
															
 
																-
															
 
																-	if (r)
															
 
																+	for (r = replicate->request[node]; r; r = r->next_same_req)
															
 
																 	{
															
 
																 		_starpu_spin_checklocked(&r->handle->header_lock);
															
 
																+		if (task && r->task && task != r->task)
															
 
																+			/* Do not collapse requests for different tasks */
															
 
																+			continue;
															
 
																+
															
 
																 		_starpu_spin_lock(&r->lock);
															
 
																                 /* perhaps we need to "upgrade" the request */
															
@@ -440,9 +441,12 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
																 		if (mode & STARPU_W)
															
 
																 			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int)  STARPU_W);
															
 
																+
															
 
																+		/* We collapse with this request */
															
 
																+		return r;
															
 
																 	}
															
 
																-	return r;
															
 
																+	return NULL;
															
 
																 }
															
@@ -469,7 +473,9 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
																 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
															
 
																 								  struct _starpu_data_replicate *dst_replicate,
															
 
																-								  enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch,
															
 
																+								  enum starpu_data_access_mode mode,
															
 
																+								  struct starpu_task *task,
															
 
																+								  enum starpu_is_prefetch is_prefetch,
															
 
																 								  unsigned async,
															
 
																 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
															
 
																 {
															
@@ -493,8 +499,11 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
																 		unsigned nnodes = starpu_memory_nodes_get_count();
															
 
																 		for (i = 0; i < nnodes; i++)
															
 
																 			for (j = 0; j < nnodes; j++)
															
 
																-				if (handle->per_node[i].request[j])
															
 
																+			{
															
 
																+				struct _starpu_data_request *r;
															
 
																+				for (r = handle->per_node[i].request[j]; r; r = r->next_same_req)
															
 
																 					nwait++;
															
 
																+			}
															
 
																 		/* If the request is not detached (i.e. the caller really wants
															
 
																 		 * proper ownership), no new requests will appear because a
															
 
																 		 * reference will be kept on the dst replicate, which will
															
@@ -531,6 +540,25 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
																 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
															
 
																 			}
															
 
																+
															
 
																+			if (task)
															
 
																+			{
															
 
																+				unsigned j;
															
 
																+				unsigned nnodes = starpu_memory_nodes_get_count();
															
 
																+				/* Cancel any existing (prefetch) request */
															
 
																+				struct _starpu_data_request *r2;
															
 
																+				for (j = 0; j < nnodes; j++)
															
 
																+				{
															
 
																+					for (r2 = dst_replicate->request[j]; r2; r2 = r2->next_same_req)
															
 
																+					{
															
 
																+						if (r2->task && r2->task == task)
															
 
																+						{
															
 
																+							r2->canceled = 1;
															
 
																+							break;
															
 
																+						}
															
 
																+					}
															
 
																+				}
															
 
																+			}
															
 
																 		}
															
 
																 		_starpu_spin_unlock(&handle->header_lock);
															
@@ -568,11 +596,12 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
																 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
															
 
																 		if (mode & STARPU_W)
															
 
																 			dst_replicate->initialized = 1;
															
 
																-		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait)
															
 
																+		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait
															
 
																+			&& !_starpu_malloc_willpin_on_node(requesting_node))
															
 
																 		{
															
 
																-			/* And this is the main RAM, really no need for a
															
 
																-			 * request, just allocate */
															
 
																-			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
															
 
																+			/* And this is the main RAM without pinning, really no need for a
															
 
																+			 * request, just quickly allocate and be done */
															
 
																+			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch, 0) == 0)
															
 
																 			{
															
 
																 				_starpu_update_data_state(handle, dst_replicate, mode);
															
 
																 				if (dst_replicate->mc)
															
@@ -629,9 +658,12 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
																 		hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;
															
 
																 		/* Try to reuse a request if possible */
															
 
																+#ifdef STARPU_DEVEL
															
 
																+#warning We do not actually want to reuse an existing request when our request is for a task with low priority, that will get executed much later. We don t want to wire down the data in between, at worse that could hog the complete gpu memory...
															
 
																+#endif
															
 
																 		r = _starpu_search_existing_data_request(hop_dst_replicate,
															
 
																 				(mode & STARPU_R)?hop_src_node:hop_dst_node,
															
 
																-							 mode, is_prefetch);
															
 
																+							 mode, task, is_prefetch);
															
 
																 		reused_requests[hop] = !!r;
															
@@ -640,7 +672,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
																 			/* Create a new request if there was no request to reuse */
															
 
																 			r = _starpu_create_data_request(handle, hop_src_replicate,
															
 
																 							hop_dst_replicate, hop_handling_node,
															
 
																-							mode, ndeps, is_prefetch, prio, 0, origin);
															
 
																+							mode, ndeps, task, is_prefetch, prio, 0, origin);
															
 
																 			nwait++;
															
 
																 		}
															
@@ -686,7 +718,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
																 		 */
															
 
																 		struct _starpu_data_request *r = _starpu_create_data_request(handle, dst_replicate,
															
 
																 							dst_replicate, requesting_node,
															
 
																-							STARPU_W, nwait, is_prefetch, prio, 1, origin);
															
 
																+							STARPU_W, nwait, task, is_prefetch, prio, 1, origin);
															
 
																 		/* and perform the callback after termination */
															
 
																 		_starpu_data_request_append_callback(r, callback_func, callback_arg);
															
@@ -701,8 +733,8 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
																 		for (i = 0; i < nnodes; i++)
															
 
																 			for (j = 0; j < nnodes; j++)
															
 
																 			{
															
 
																-				struct _starpu_data_request *r2 = handle->per_node[i].request[j];
															
 
																-				if (r2)
															
 
																+				struct _starpu_data_request *r2;
															
 
																+				for (r2 = handle->per_node[i].request[j]; r2; r2 = r2->next_same_req)
															
 
																 				{
															
 
																 					_starpu_spin_lock(&r2->lock);
															
 
																 					if (is_prefetch < r2->prefetch)
															
@@ -736,7 +768,8 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
																 }
															
 
																 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
															
 
																-			       enum starpu_data_access_mode mode, unsigned detached, enum starpu_is_prefetch is_prefetch, unsigned async,
															
 
																+			       enum starpu_data_access_mode mode, unsigned detached,
															
 
																+			       struct starpu_task *task, enum starpu_is_prefetch is_prefetch, unsigned async,
															
 
																 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
															
 
																 {
															
 
																         _STARPU_LOG_IN();
															
@@ -745,7 +778,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
																 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
															
 
																 	{
															
 
																 		cpt++;
															
 
																-		_starpu_datawizard_progress(1);
															
 
																+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
															
 
																 	}
															
 
																 	if (cpt == STARPU_SPIN_MAXTRY)
															
 
																 		_starpu_spin_lock(&handle->header_lock);
															
@@ -790,7 +823,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
																 	struct _starpu_data_request *r;
															
 
																 	r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
															
 
																-						 is_prefetch, async, callback_func, callback_arg, prio, origin);
															
 
																+						 task, is_prefetch, async, callback_func, callback_arg, prio, origin);
															
 
																 	/* If no request was created, the handle was already up-to-date on the
															
 
																 	 * node. In this case, _starpu_create_request_to_fetch_data has already
															
@@ -805,24 +838,24 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
																         return ret;
															
 
																 }
															
 
																-static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
															
 
																+static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
															
 
																 {
															
 
																-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
															
 
																+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
															
 
																 }
															
 
																-static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
															
 
																+static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
															
 
																 {
															
 
																-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
															
 
																+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
															
 
																 }
															
 
																-static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
															
 
																+static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
															
 
																 {
															
 
																-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
															
 
																+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
															
 
																 }
															
 
																-static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
															
 
																+static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
															
 
																 {
															
 
																-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
															
 
																+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, task, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
															
 
																 }
															
 
																 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
															
@@ -861,8 +894,15 @@ uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 
																 void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
															
 
																 {
															
 
																 	uint32_t wt_mask;
															
 
																+	size_t max_wt_mask = sizeof(wt_mask) * 8;
															
 
																+	unsigned wt_count = starpu_memory_nodes_get_count();
															
 
																+	if (max_wt_mask > STARPU_MAXNODES)
															
 
																+		max_wt_mask = STARPU_MAXNODES;
															
 
																+	if (wt_count > max_wt_mask)
															
 
																+		wt_count = max_wt_mask;
															
 
																+
															
 
																 	wt_mask = default_wt_mask | handle->wt_mask;
															
 
																-	wt_mask &= (1<<starpu_memory_nodes_get_count())-1;
															
 
																+	wt_mask &= (1ULL<<max_wt_mask)-1;
															
 
																 	/* Note that it is possible that there is no valid copy of the data (if
															
 
																 	 * starpu_data_invalidate was called for instance). In that case, we do
															
@@ -871,14 +911,14 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 
																 	unsigned memory_node = replicate->memory_node;
															
 
																 	if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
															
 
																-	if (wt_mask & ~(1<<memory_node))
															
 
																+	if (wt_mask && (memory_node >= max_wt_mask || wt_mask & ~(1<<memory_node)))
															
 
																 		_starpu_write_through_data(handle, memory_node, wt_mask);
															
 
																 	int cpt = 0;
															
 
																 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
															
 
																 	{
															
 
																 		cpt++;
															
 
																-		_starpu_datawizard_progress(1);
															
 
																+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
															
 
																 	}
															
 
																 	if (cpt == STARPU_SPIN_MAXTRY)
															
 
																 		_starpu_spin_lock(&handle->header_lock);
															
@@ -897,26 +937,6 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 
																 		_starpu_spin_unlock(&handle->header_lock);
															
 
																 }
															
 
																-static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
															
 
																-{
															
 
																-	int cpt = 0;
															
 
																-	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
															
 
																-	{
															
 
																-		cpt++;
															
 
																-		_starpu_datawizard_progress(1);
															
 
																-	}
															
 
																-	if (cpt == STARPU_SPIN_MAXTRY)
															
 
																-		_starpu_spin_lock(&handle->header_lock);
															
 
																-
															
 
																-	if (replicate->state == STARPU_INVALID)
															
 
																-	{
															
 
																-		unsigned dst_node = replicate->memory_node;
															
 
																-		replicate->requested |= 1UL << dst_node;
															
 
																-	}
															
 
																-
															
 
																-	_starpu_spin_unlock(&handle->header_lock);
															
 
																-}
															
 
																-
															
 
																 int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio, enum starpu_is_prefetch prefetch)
															
 
																 {
															
 
																 #ifdef STARPU_OPENMP
															
@@ -945,12 +965,9 @@ int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node,
 
																 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
															
 
																 		if (prefetch == STARPU_PREFETCH)
															
 
																-		{
															
 
																-			task_prefetch_data_on_node(handle, node, replicate, mode, prio);
															
 
																-			_starpu_set_data_requested_flag_if_needed(handle, replicate);
															
 
																-		}
															
 
																+			task_prefetch_data_on_node(handle, node, replicate, mode, task, prio);
															
 
																 		else
															
 
																-			idle_prefetch_data_on_node(handle, node, replicate, mode, prio);
															
 
																+			idle_prefetch_data_on_node(handle, node, replicate, mode, task, prio);
															
 
																 	}
															
 
																 	if (prefetch == STARPU_PREFETCH)
															
@@ -1117,8 +1134,8 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
																 		if (async)
															
 
																 		{
															
 
																-			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
															
 
																-					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
															
 
																+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, task, STARPU_FETCH, 1,
															
 
																+					_starpu_fetch_task_input_cb, worker, task->priority, "_starpu_fetch_task_input");
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 			if (_starpu_simgrid_fetching_input_cost())
															
 
																 				starpu_sleep(0.000001);
															
@@ -1133,7 +1150,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
																 		}
															
 
																 		else
															
 
																 		{
															
 
																-			ret = fetch_data(handle, node, local_replicate, mode, 0);
															
 
																+			ret = fetch_data(handle, node, local_replicate, mode, task, task->priority);
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 			if (_starpu_simgrid_fetching_input_cost())
															
 
																 				starpu_sleep(0.000001);
															
@@ -1371,7 +1388,7 @@ void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
 
																 		local_replicate = get_replicate(handle, mode, -1, node);
															
 
																-		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
															
 
																+		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, task, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
															
 
																 	}
															
 
																 	if (profiling && task->profiling_info)
															
@@ -1421,7 +1438,7 @@ unsigned starpu_data_is_on_node(starpu_data_handle_t handle, unsigned node)
 
																 		for (i = 0; i < nnodes; i++)
															
 
																 		{
															
 
																-			if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
															
 
																+			if (handle->per_node[node].request[i])
															
 
																 				ret = 1;
															
 
																 		}
															
--- a/src/datawizard/coherency.h
+++ b/src/datawizard/coherency.h
@@ -72,15 +72,13 @@ struct _starpu_data_replicate
 
																 	 * */
															
 
																 	unsigned automatically_allocated:1;
															
 
																-	/** To help the scheduling policies to make some decision, we
															
 
																-	   may keep a track of the tasks that are likely to request
															
 
																-	   this data on the current node.
															
 
																-	   It is the responsability of the scheduling _policy_ to set that
															
 
																-	   flag when it assigns a task to a queue, policies which do not
															
 
																-	   use this hint can simply ignore it.
															
 
																-	 */
															
 
																-	uint32_t requested;
															
 
																+	/** This tracks the list of requests to provide the value */
															
 
																 	struct _starpu_data_request *request[STARPU_MAXNODES];
															
 
																+	/** This points to the last entry of request, to easily append to the list */
															
 
																+	struct _starpu_data_request *last_request[STARPU_MAXNODES];
															
 
																+
															
 
																+	/* Which request is loading data here */
															
 
																+	struct _starpu_data_request *load_request;
															
 
																 	/** The number of prefetches that we made for this replicate for various tasks
															
 
																 	 * This is also the number of tasks that we will wait to see use the mc before
															
@@ -322,7 +320,8 @@ struct _starpu_data_state
 
																  * async means that _starpu_fetch_data_on_node will wait for completion of the request
															
 
																  */
															
 
																 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
															
 
																-			       enum starpu_data_access_mode mode, unsigned detached, enum starpu_is_prefetch is_prefetch, unsigned async,
															
 
																+			       enum starpu_data_access_mode mode, unsigned detached,
															
 
																+			       struct starpu_task *task, enum starpu_is_prefetch is_prefetch, unsigned async,
															
 
																 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
															
 
																 /** This releases a reference on the handle */
															
 
																 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
															
@@ -369,7 +368,8 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 
																  */
															
 
																 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
															
 
																 								  struct _starpu_data_replicate *dst_replicate,
															
 
																-								  enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch,
															
 
																+								  enum starpu_data_access_mode mode,
															
 
																+								  struct starpu_task *task, enum starpu_is_prefetch is_prefetch,
															
 
																 								  unsigned async,
															
 
																 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
															
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -200,7 +200,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 
																 									struct _starpu_data_replicate *dst_replicate,
															
 
																 									unsigned donotread,
															
 
																 									struct _starpu_data_request *req,
															
 
																-									unsigned may_alloc,
															
 
																+									enum _starpu_may_alloc may_alloc,
															
 
																 									enum starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																 	if (!donotread)
															
@@ -215,11 +215,11 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 
																 	/* first make sure the destination has an allocated buffer */
															
 
																 	if (!dst_replicate->allocated)
															
 
																 	{
															
 
																-		if (!may_alloc || _starpu_is_reclaiming(dst_node))
															
 
																+		if (may_alloc==STARPU_DATAWIZARD_DO_NOT_ALLOC || _starpu_is_reclaiming(dst_node))
															
 
																 			/* We're not supposed to allocate there at the moment */
															
 
																 			return -ENOMEM;
															
 
																-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : STARPU_FETCH);
															
 
																+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, prefetch, may_alloc==STARPU_DATAWIZARD_ONLY_FAST_ALLOC);
															
 
																 		if (ret_alloc)
															
 
																 			return -ENOMEM;
															
 
																 	}
															
--- a/src/datawizard/copy_driver.h
+++ b/src/datawizard/copy_driver.h
@@ -47,6 +47,13 @@ extern "C"
 
																 struct _starpu_data_request;
															
 
																 struct _starpu_data_replicate;
															
 
																+enum _starpu_may_alloc
															
 
																+{
															
 
																+	STARPU_DATAWIZARD_DO_NOT_ALLOC,
															
 
																+	STARPU_DATAWIZARD_DO_ALLOC,
															
 
																+	STARPU_DATAWIZARD_ONLY_FAST_ALLOC
															
 
																+};
															
 
																+
															
 
																 #ifdef STARPU_USE_MIC
															
 
																 /** MIC needs memory_node to know which MIC is concerned.
															
 
																  * mark is used to wait asynchronous request.
															
@@ -131,7 +138,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 
																 				    struct _starpu_data_replicate *dst_replicate,
															
 
																 				    unsigned donotread,
															
 
																 				    struct _starpu_data_request *req,
															
 
																-				    unsigned may_alloc,
															
 
																+				    enum _starpu_may_alloc may_alloc,
															
 
																 				    enum starpu_is_prefetch prefetch);
															
 
																 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
															
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -25,57 +25,67 @@
 
																 #include <core/simgrid.h>
															
 
																 /* requests that have not been treated at all */
															
 
																-#ifdef STARPU_DEVEL
															
 
																-#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectionnal
															
 
																-#endif
															
 
																-static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
															
 
																-static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES]; /* Contains both task_prefetch and prefetch */
															
 
																-static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
															
 
																-static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
															
 
																+static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES][STARPU_MAXNODES][2];
															
 
																+static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES][STARPU_MAXNODES][2]; /* Contains both task_prefetch and prefetch */
															
 
																+static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES][STARPU_MAXNODES][2];
															
 
																+static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES][STARPU_MAXNODES][2];
															
 
																 /* requests that are not terminated (eg. async transfers) */
															
 
																-static struct _starpu_data_request_prio_list data_requests_pending[STARPU_MAXNODES];
															
 
																-static unsigned data_requests_npending[STARPU_MAXNODES];
															
 
																-static starpu_pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES];
															
 
																+static struct _starpu_data_request_prio_list data_requests_pending[STARPU_MAXNODES][STARPU_MAXNODES][2];
															
 
																+static unsigned data_requests_npending[STARPU_MAXNODES][STARPU_MAXNODES][2];
															
 
																+static starpu_pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES][STARPU_MAXNODES][2];
															
 
																 void _starpu_init_data_request_lists(void)
															
 
																 {
															
 
																-	unsigned i;
															
 
																+	unsigned i, j;
															
 
																+	enum _starpu_data_request_inout k;
															
 
																 	for (i = 0; i < STARPU_MAXNODES; i++)
															
 
																 	{
															
 
																-		_starpu_data_request_prio_list_init(&data_requests[i]);
															
 
																-		_starpu_data_request_prio_list_init(&prefetch_requests[i]);
															
 
																-		_starpu_data_request_prio_list_init(&idle_requests[i]);
															
 
																+		for (j = 0; j < STARPU_MAXNODES; j++)
															
 
																+		{
															
 
																+			for (k = _STARPU_DATA_REQUEST_IN; k <= _STARPU_DATA_REQUEST_OUT; k++)
															
 
																+			{
															
 
																+				_starpu_data_request_prio_list_init(&data_requests[i][j][k]);
															
 
																+				_starpu_data_request_prio_list_init(&prefetch_requests[i][j][k]);
															
 
																+				_starpu_data_request_prio_list_init(&idle_requests[i][j][k]);
															
 
																 #ifndef STARPU_DEBUG
															
 
																-		/* Tell helgrind that we are fine with checking for list_empty
															
 
																-		 * in _starpu_handle_node_data_requests, we will call it
															
 
																-		 * periodically anyway */
															
 
																-		STARPU_HG_DISABLE_CHECKING(data_requests[i].tree.root);
															
 
																-		STARPU_HG_DISABLE_CHECKING(prefetch_requests[i].tree.root);
															
 
																-		STARPU_HG_DISABLE_CHECKING(idle_requests[i].tree.root);
															
 
																+				/* Tell helgrind that we are fine with checking for list_empty
															
 
																+				 * in _starpu_handle_node_data_requests, we will call it
															
 
																+				 * periodically anyway */
															
 
																+				STARPU_HG_DISABLE_CHECKING(data_requests[i][j][k].tree.root);
															
 
																+				STARPU_HG_DISABLE_CHECKING(prefetch_requests[i][j][k].tree.root);
															
 
																+				STARPU_HG_DISABLE_CHECKING(idle_requests[i][j][k].tree.root);
															
 
																 #endif
															
 
																+				_starpu_data_request_prio_list_init(&data_requests_pending[i][j][k]);
															
 
																+				data_requests_npending[i][j][k] = 0;
															
 
																-		STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i], NULL);
															
 
																-
															
 
																-		_starpu_data_request_prio_list_init(&data_requests_pending[i]);
															
 
																-		data_requests_npending[i] = 0;
															
 
																-		STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i], NULL);
															
 
																+				STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i][j][k], NULL);
															
 
																+				STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i][j][k], NULL);
															
 
																+			}
															
 
																+		}
															
 
																 	}
															
 
																 	STARPU_HG_DISABLE_CHECKING(data_requests_npending);
															
 
																 }
															
 
																 void _starpu_deinit_data_request_lists(void)
															
 
																 {
															
 
																-	unsigned i;
															
 
																+	unsigned i, j;
															
 
																+	enum _starpu_data_request_inout k;
															
 
																 	for (i = 0; i < STARPU_MAXNODES; i++)
															
 
																 	{
															
 
																-		_starpu_data_request_prio_list_deinit(&data_requests[i]);
															
 
																-		_starpu_data_request_prio_list_deinit(&prefetch_requests[i]);
															
 
																-		_starpu_data_request_prio_list_deinit(&idle_requests[i]);
															
 
																-		STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i]);
															
 
																-		_starpu_data_request_prio_list_deinit(&data_requests_pending[i]);
															
 
																-		STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i]);
															
 
																+		for (j = 0; j < STARPU_MAXNODES; j++)
															
 
																+		{
															
 
																+			for (k = _STARPU_DATA_REQUEST_IN; k <= _STARPU_DATA_REQUEST_OUT; k++)
															
 
																+			{
															
 
																+				_starpu_data_request_prio_list_deinit(&data_requests[i][j][k]);
															
 
																+				_starpu_data_request_prio_list_deinit(&prefetch_requests[i][j][k]);
															
 
																+				_starpu_data_request_prio_list_deinit(&idle_requests[i][j][k]);
															
 
																+				_starpu_data_request_prio_list_deinit(&data_requests_pending[i][j][k]);
															
 
																+				STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i][j][k]);
															
 
																+				STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i][j][k]);
															
 
																+			}
															
 
																+		}
															
 
																 	}
															
 
																 }
															
@@ -92,23 +102,39 @@ static void _starpu_data_request_unlink(struct _starpu_data_request *r)
 
																 		STARPU_ASSERT(r->mode == STARPU_W);
															
 
																 		r->handle->write_invalidation_req = NULL;
															
 
																 	}
															
 
																-	else if (r->mode & STARPU_R)
															
 
																-	{
															
 
																-		/* If this is a read request, we store the pending requests
															
 
																-		 * between src and dst. */
															
 
																-		unsigned node = r->src_replicate->memory_node;
															
 
																-		STARPU_ASSERT(r->dst_replicate->request[node] == r);
															
 
																-		r->dst_replicate->request[node] = NULL;
															
 
																-	}
															
 
																 	else
															
 
																 	{
															
 
																-		/* If this is a write only request, then there is no source and
															
 
																-		 * we use the destination node to cache the request. */
															
 
																-		unsigned node = r->dst_replicate->memory_node;
															
 
																-		STARPU_ASSERT(r->dst_replicate->request[node] == r);
															
 
																-		r->dst_replicate->request[node] = NULL;
															
 
																-	}
															
 
																+		unsigned node;
															
 
																+		struct _starpu_data_request **prevp, *prev;
															
 
																+
															
 
																+		if (r->mode & STARPU_R)
															
 
																+			/* If this is a read request, we store the pending requests
															
 
																+			 * between src and dst. */
															
 
																+			node = r->src_replicate->memory_node;
															
 
																+		else
															
 
																+			/* If this is a write only request, then there is no source and
															
 
																+			 * we use the destination node to cache the request. */
															
 
																+			node = r->dst_replicate->memory_node;
															
 
																+
															
 
																+		/* Look for ourself in the list, we should be not very far. */
															
 
																+		for (prevp = &r->dst_replicate->request[node], prev = NULL;
															
 
																+		     *prevp && *prevp != r;
															
 
																+		     prev = *prevp, prevp = &prev->next_same_req)
															
 
																+			;
															
 
																+		STARPU_ASSERT(*prevp == r);
															
 
																+		*prevp = r->next_same_req;
															
 
																+
															
 
																+		if (!r->next_same_req)
															
 
																+		{
															
 
																+			/* I was last */
															
 
																+			STARPU_ASSERT(r->dst_replicate->last_request[node] == r);
															
 
																+			if (prev)
															
 
																+				r->dst_replicate->last_request[node] = prev;
															
 
																+			else
															
 
																+				r->dst_replicate->last_request[node] = NULL;
															
 
																+		}
															
 
																+	}
															
 
																 }
															
 
																 static void _starpu_data_request_destroy(struct _starpu_data_request *r)
															
@@ -124,6 +150,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
																 							 int handling_node,
															
 
																 							 enum starpu_data_access_mode mode,
															
 
																 							 unsigned ndeps,
															
 
																+							 struct starpu_task *task,
															
 
																 							 enum starpu_is_prefetch is_prefetch,
															
 
																 							 int prio,
															
 
																 							 unsigned is_write_invalidation,
															
@@ -135,7 +162,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
																 	_starpu_spin_init(&r->lock);
															
 
																-	_STARPU_TRACE_DATA_REQUEST_CREATED(handle, src_replicate?src_replicate->memory_node:-1, dst_replicate?dst_replicate->memory_node:-1, prio, is_prefetch);
															
 
																+	_STARPU_TRACE_DATA_REQUEST_CREATED(handle, src_replicate?src_replicate->memory_node:-1, dst_replicate?dst_replicate->memory_node:-1, prio, is_prefetch, r);
															
 
																 	r->origin = origin;
															
 
																 	r->handle = handle;
															
@@ -153,22 +180,48 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
																 	if (handling_node == -1)
															
 
																 		handling_node = STARPU_MAIN_RAM;
															
 
																 	r->handling_node = handling_node;
															
 
																+	if (is_write_invalidation)
															
 
																+	{
															
 
																+		r->peer_node = handling_node;
															
 
																+		r->inout = _STARPU_DATA_REQUEST_IN;
															
 
																+	}
															
 
																+	else if (dst_replicate->memory_node == handling_node)
															
 
																+	{
															
 
																+		if (src_replicate)
															
 
																+			r->peer_node = src_replicate->memory_node;
															
 
																+		else
															
 
																+			r->peer_node = handling_node;
															
 
																+		r->inout = _STARPU_DATA_REQUEST_IN;
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		r->peer_node = dst_replicate->memory_node;
															
 
																+		r->inout = _STARPU_DATA_REQUEST_OUT;
															
 
																+	}
															
 
																 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
															
 
																 	r->completed = 0;
															
 
																+	r->added_ref = 0;
															
 
																+	r->canceled = 0;
															
 
																 	r->prefetch = is_prefetch;
															
 
																+	r->task = task;
															
 
																 	r->nb_tasks_prefetch = 0;
															
 
																 	r->prio = prio;
															
 
																 	r->retval = -1;
															
 
																 	r->ndeps = ndeps;
															
 
																+	r->next_same_req = NULL;
															
 
																 	r->next_req_count = 0;
															
 
																 	r->callbacks = NULL;
															
 
																 	r->com_id = 0;
															
 
																 	_starpu_spin_lock(&r->lock);
															
 
																-	/* Take a reference on the target for the request to be able to write it */
															
 
																-	if (dst_replicate)
															
 
																+	/* For a fetch, take a reference as soon as now on the target, to avoid
															
 
																+	 * replicate eviction */
															
 
																+	if (is_prefetch == STARPU_FETCH && dst_replicate)
															
 
																+	{
															
 
																+		r->added_ref = 1;
															
 
																 		dst_replicate->refcnt++;
															
 
																+	}
															
 
																 	handle->busy_count++;
															
 
																 	if (is_write_invalidation)
															
@@ -176,20 +229,28 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
																 		STARPU_ASSERT(!handle->write_invalidation_req);
															
 
																 		handle->write_invalidation_req = r;
															
 
																 	}
															
 
																-	else if (mode & STARPU_R)
															
 
																-	{
															
 
																-		unsigned src_node = src_replicate->memory_node;
															
 
																-		STARPU_ASSERT(!dst_replicate->request[src_node]);
															
 
																-		dst_replicate->request[src_node] = r;
															
 
																-		/* Take a reference on the source for the request to be able to read it */
															
 
																-		src_replicate->refcnt++;
															
 
																-		handle->busy_count++;
															
 
																-	}
															
 
																 	else
															
 
																 	{
															
 
																-		unsigned dst_node = dst_replicate->memory_node;
															
 
																-		STARPU_ASSERT(!dst_replicate->request[dst_node]);
															
 
																-		dst_replicate->request[dst_node] = r;
															
 
																+		unsigned node;
															
 
																+
															
 
																+		if (mode & STARPU_R)
															
 
																+			node = src_replicate->memory_node;
															
 
																+		else
															
 
																+			node = dst_replicate->memory_node;
															
 
																+
															
 
																+		if (!dst_replicate->request[node])
															
 
																+			dst_replicate->request[node] = r;
															
 
																+		else
															
 
																+			dst_replicate->last_request[node]->next_same_req = r;
															
 
																+		dst_replicate->last_request[node] = r;
															
 
																+
															
 
																+		if (mode & STARPU_R)
															
 
																+		{
															
 
																+			/* Take a reference on the source for the request to be
															
 
																+			 * able to read it */
															
 
																+			src_replicate->refcnt++;
															
 
																+			handle->busy_count++;
															
 
																+		}
															
 
																 	}
															
 
																 	r->refcnt = 1;
															
@@ -199,7 +260,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
																 	return r;
															
 
																 }
															
 
																-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc)
															
 
																+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
															
 
																 {
															
 
																 	int retval;
															
 
																 	int do_delete = 0;
															
@@ -310,14 +371,14 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 
																 	}
															
 
																 	/* insert the request in the proper list */
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
															
 
																+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][r->peer_node][r->inout]);
															
 
																 	if (r->prefetch >= STARPU_IDLEFETCH)
															
 
																-		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
															
 
																+		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node][r->peer_node][r->inout], r);
															
 
																 	else if (r->prefetch > STARPU_FETCH)
															
 
																-		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
															
 
																+		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node][r->peer_node][r->inout], r);
															
 
																 	else
															
 
																-		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node]);
															
 
																+		_starpu_data_request_prio_list_push_back(&data_requests[handling_node][r->peer_node][r->inout], r);
															
 
																+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][r->peer_node][r->inout]);
															
 
																 #ifndef STARPU_NON_BLOCKING_DRIVERS
															
 
																 	_starpu_wake_all_blocked_workers_on_node(handling_node);
															
@@ -352,7 +413,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
																 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
															
 
																-	if (dst_replicate)
															
 
																+	if (r->canceled < 2 && dst_replicate)
															
 
																 	{
															
 
																 #ifdef STARPU_MEMORY_STATS
															
 
																 		enum _starpu_cache_state old_src_replicate_state = src_replicate->state;
															
@@ -360,6 +421,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
																 		_starpu_spin_checklocked(&handle->header_lock);
															
 
																 		_starpu_update_data_state(handle, r->dst_replicate, mode);
															
 
																+		dst_replicate->load_request = NULL;
															
 
																 #ifdef STARPU_MEMORY_STATS
															
 
																 		if (src_replicate->state == STARPU_INVALID)
															
@@ -382,7 +444,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
																 #endif
															
 
																 	}
															
 
																-	if (r->com_id > 0)
															
 
																+	if (r->canceled < 2 && r->com_id > 0)
															
 
																 	{
															
 
																 #ifdef STARPU_USE_FXT
															
 
																 		unsigned src_node = src_replicate->memory_node;
															
@@ -414,12 +476,15 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
																 	/* Remove a reference on the destination replicate for the request */
															
 
																 	if (dst_replicate)
															
 
																 	{
															
 
																-		if (dst_replicate->mc)
															
 
																+		if (r->canceled < 2 && dst_replicate->mc)
															
 
																 			/* Make sure it stays there for the task.  */
															
 
																 			dst_replicate->nb_tasks_prefetch += r->nb_tasks_prefetch;
															
 
																-		STARPU_ASSERT(dst_replicate->refcnt > 0);
															
 
																-		dst_replicate->refcnt--;
															
 
																+		if (r->added_ref)
															
 
																+		{
															
 
																+			STARPU_ASSERT(dst_replicate->refcnt > 0);
															
 
																+			dst_replicate->refcnt--;
															
 
																+		}
															
 
																 	}
															
 
																 	STARPU_ASSERT(handle->busy_count > 0);
															
 
																 	handle->busy_count--;
															
@@ -467,8 +532,16 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
																 	}
															
 
																 }
															
 
																+void _starpu_data_request_complete_wait(void *arg)
															
 
																+{
															
 
																+	struct _starpu_data_request *r = arg;
															
 
																+	_starpu_spin_lock(&r->handle->header_lock);
															
 
																+	_starpu_spin_lock(&r->lock);
															
 
																+	starpu_handle_data_request_completion(r);
															
 
																+}
															
 
																+
															
 
																 /* TODO : accounting to see how much time was spent working for other people ... */
															
 
																-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, enum starpu_is_prefetch prefetch)
															
 
																+static int starpu_handle_data_request(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
															
 
																 {
															
 
																 	starpu_data_handle_t handle = r->handle;
															
@@ -491,12 +564,50 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
																 	struct _starpu_data_replicate *src_replicate = r->src_replicate;
															
 
																 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
															
 
																+	if (r->canceled)
															
 
																+	{
															
 
																+		/* Ok, canceled before starting copies etc. */
															
 
																+		r->canceled = 2;
															
 
																+		/* Nothing left to do */
															
 
																+		starpu_handle_data_request_completion(r);
															
 
																+		return 0;
															
 
																+	}
															
 
																+
															
 
																+	if (dst_replicate)
															
 
																+	{
															
 
																+		struct _starpu_data_request *r2 = dst_replicate->load_request;
															
 
																+		if (r2 && r2 != r)
															
 
																+		{
															
 
																+			/* Oh, some other transfer is already loading the value. Just wait for it */
															
 
																+			r->canceled = 2;
															
 
																+			_starpu_spin_unlock(&r->lock);
															
 
																+			_starpu_spin_lock(&r2->lock);
															
 
																+			_starpu_data_request_append_callback(r2, _starpu_data_request_complete_wait, r);
															
 
																+			_starpu_spin_unlock(&r2->lock);
															
 
																+			_starpu_spin_unlock(&handle->header_lock);
															
 
																+			return 0;
															
 
																+		}
															
 
																+
															
 
																+		/* We are loading this replicate.
															
 
																+		 * Note: we might fail to allocate memory, but we will keep on and others will wait for us. */
															
 
																+		dst_replicate->load_request = r;
															
 
																+	}
															
 
																+
															
 
																 	enum starpu_data_access_mode r_mode = r->mode;
															
 
																 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate);
															
 
																 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->allocated);
															
 
																 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->refcnt);
															
 
																+	/* For prefetches, we take a reference on the destination only now that
															
 
																+	 * we will really try to fetch the data (instead of in
															
 
																+	 * _starpu_create_data_request) */
															
 
																+	if (dst_replicate && r->prefetch > STARPU_FETCH)
															
 
																+	{
															
 
																+		r->added_ref = 1;	/* Note: we might get upgraded while trying to allocate */
															
 
																+		dst_replicate->refcnt++;
															
 
																+	}
															
 
																+
															
 
																 	_starpu_spin_unlock(&r->lock);
															
 
																 	/* FIXME: the request may get upgraded from here to freeing it... */
															
@@ -507,7 +618,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
																 	if (dst_replicate && dst_replicate->state == STARPU_INVALID)
															
 
																 		r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
															
 
																-						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc, prefetch);
															
 
																+						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc, r->prefetch);
															
 
																 	else
															
 
																 		/* Already valid actually, no need to transfer anything */
															
 
																 		r->retval = 0;
															
@@ -516,6 +627,15 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
																 	{
															
 
																 		/* If there was not enough memory, we will try to redo the
															
 
																 		 * request later. */
															
 
																+
															
 
																+		if (r->prefetch > STARPU_FETCH)
															
 
																+		{
															
 
																+			STARPU_ASSERT(r->added_ref);
															
 
																+			/* Drop ref until next try */
															
 
																+			r->added_ref = 0;
															
 
																+			dst_replicate->refcnt--;
															
 
																+		}
															
 
																+
															
 
																 		_starpu_spin_unlock(&handle->header_lock);
															
 
																 		return -ENOMEM;
															
 
																 	}
															
@@ -528,10 +648,10 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
																 		 * requests in the meantime. */
															
 
																 		_starpu_spin_unlock(&handle->header_lock);
															
 
																-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
															
 
																-		_starpu_data_request_prio_list_push_back(&data_requests_pending[r->handling_node], r);
															
 
																-		data_requests_npending[r->handling_node]++;
															
 
																-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
															
 
																+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node][r->peer_node][r->inout]);
															
 
																+		_starpu_data_request_prio_list_push_back(&data_requests_pending[r->handling_node][r->peer_node][r->inout], r);
															
 
																+		data_requests_npending[r->handling_node][r->peer_node][r->inout]++;
															
 
																+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node][r->peer_node][r->inout]);
															
 
																 		return -EAGAIN;
															
 
																 	}
															
@@ -543,10 +663,9 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
																 	return 0;
															
 
																 }
															
 
																-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
															
 
																+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list reqlist[STARPU_MAXNODES][STARPU_MAXNODES][2], unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
															
 
																 {
															
 
																 	struct _starpu_data_request *r;
															
 
																-	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
															
 
																 	unsigned i;
															
 
																 	int ret = 0;
															
@@ -556,48 +675,55 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
																 	/* This is racy, but not posing problems actually, since we know we
															
 
																 	 * will come back here to probe again regularly anyway.
															
 
																 	 * Thus, do not expose this optimization to helgrind */
															
 
																-	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&reqlist[src_node]))
															
 
																+	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]))
															
 
																 		return 0;
															
 
																 #endif
															
 
																-	/* TODO optimize */
															
 
																+	/* We create a new list to pickup some requests from the main list, and
															
 
																+	 * we handle the request(s) one by one from it, without concurrency issues.
															
 
																+	 */
															
 
																+	struct _starpu_data_request_list local_list, remain_list;
															
 
																+	_starpu_data_request_list_init(&local_list);
															
 
																 #ifdef STARPU_NON_BLOCKING_DRIVERS
															
 
																 	/* take all the entries from the request list */
															
 
																-	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[src_node]))
															
 
																+	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]))
															
 
																 	{
															
 
																 		/* List is busy, do not bother with it */
															
 
																 		return -EBUSY;
															
 
																 	}
															
 
																 #else
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
															
 
																+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
															
 
																 #endif
															
 
																-	if (_starpu_data_request_prio_list_empty(&reqlist[src_node]))
															
 
																+	for (i = data_requests_npending[handling_node][peer_node][inout];
															
 
																+		i < n && ! _starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]);
															
 
																+		i++)
															
 
																 	{
															
 
																-		/* there is no request */
															
 
																-                STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
															
 
																-		return 0;
															
 
																+		r = _starpu_data_request_prio_list_pop_front_highest(&reqlist[handling_node][peer_node][inout]);
															
 
																+		_starpu_data_request_list_push_back(&local_list, r);
															
 
																 	}
															
 
																-	/* There is an entry: we create a new empty list to replace the list of
															
 
																-	 * requests, and we handle the request(s) one by one in the former
															
 
																-	 * list, without concurrency issues.*/
															
 
																-	struct _starpu_data_request_prio_list local_list = reqlist[src_node];
															
 
																-	_starpu_data_request_prio_list_init(&reqlist[src_node]);
															
 
																+	if (!_starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]))
															
 
																+		/* We have left some requests */
															
 
																+		ret = -EBUSY;
															
 
																+
															
 
																+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
															
 
																+	if (_starpu_data_request_list_empty(&local_list))
															
 
																+		/* there is no request */
															
 
																+		return 0;
															
 
																-	for (i = 0; i <= prefetch; i++)
															
 
																-		_starpu_data_request_prio_list_init(&new_data_requests[i]);
															
 
																+	/* This will contain the remaining requests */
															
 
																+	_starpu_data_request_list_init(&remain_list);
															
 
																 	double start = starpu_timing_now();
															
 
																 	/* for all entries of the list */
															
 
																-	while (!_starpu_data_request_prio_list_empty(&local_list))
															
 
																+	while (!_starpu_data_request_list_empty(&local_list))
															
 
																 	{
															
 
																                 int res;
															
 
																-		if (data_requests_npending[src_node] >= n)
															
 
																+		if (data_requests_npending[handling_node][peer_node][inout] >= n)
															
 
																 		{
															
 
																 			/* Too many requests at the same time, skip pushing
															
 
																 			 * more for now */
															
@@ -605,21 +731,22 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
																 			break;
															
 
																 		}
															
 
																-		r = _starpu_data_request_prio_list_pop_front_highest(&local_list);
															
 
																+		r = _starpu_data_request_list_pop_front(&local_list);
															
 
																-		res = starpu_handle_data_request(r, may_alloc, prefetch);
															
 
																+		res = starpu_handle_data_request(r, may_alloc);
															
 
																 		if (res != 0 && res != -EAGAIN)
															
 
																 		{
															
 
																 			/* handle is busy, or not enough memory, postpone for now */
															
 
																 			ret = res;
															
 
																 			/* Prefetch requests might have gotten promoted while in tmp list */
															
 
																-			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
															
 
																+			_starpu_data_request_list_push_back(&remain_list, r);
															
 
																 			if (prefetch > STARPU_FETCH)
															
 
																 				/* Prefetching more there would make the situation even worse */
															
 
																 				break;
															
 
																 		}
															
 
																+		else
															
 
																+			(*pushed)++;
															
 
																-		(*pushed)++;
															
 
																 		if (starpu_timing_now() - start >= MAX_PUSH_TIME)
															
 
																 		{
															
 
																 			/* We have spent a lot of time doing requests, skip pushing more for now */
															
@@ -628,43 +755,23 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
																 		}
															
 
																 	}
															
 
																-	/* Push back requests we didn't handle on the proper list */
															
 
																-	while (!_starpu_data_request_prio_list_empty(&local_list))
															
 
																-	{
															
 
																-		r = _starpu_data_request_prio_list_pop_front_highest(&local_list);
															
 
																-		/* Prefetch requests might have gotten promoted while in tmp list */
															
 
																-		_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
															
 
																-	}
															
 
																-	_starpu_data_request_prio_list_deinit(&local_list);
															
 
																-
															
 
																-	for (i = 0; i <= prefetch; i++)
															
 
																-		if (!_starpu_data_request_prio_list_empty(&new_data_requests[i]))
															
 
																-			break;
															
 
																+	/* Gather remainder */
															
 
																+	_starpu_data_request_list_push_list_back(&remain_list, &local_list);
															
 
																-	if (i <= prefetch)
															
 
																+	if (!_starpu_data_request_list_empty(&remain_list))
															
 
																 	{
															
 
																-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
															
 
																-		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_FETCH])))
															
 
																-		{
															
 
																-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_FETCH], &data_requests[src_node]);
															
 
																-			data_requests[src_node] = new_data_requests[STARPU_FETCH];
															
 
																-		}
															
 
																-		if (prefetch >= STARPU_TASK_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_TASK_PREFETCH])))
															
 
																+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
															
 
																+		while (!_starpu_data_request_list_empty(&remain_list))
															
 
																 		{
															
 
																-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_TASK_PREFETCH], &prefetch_requests[src_node]);
															
 
																-			prefetch_requests[src_node] = new_data_requests[STARPU_TASK_PREFETCH];
															
 
																-		}
															
 
																-		if (prefetch >= STARPU_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_PREFETCH])))
															
 
																-		{
															
 
																-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_PREFETCH], &prefetch_requests[src_node]);
															
 
																-			prefetch_requests[src_node] = new_data_requests[STARPU_PREFETCH];
															
 
																-		}
															
 
																-		if (prefetch >= STARPU_IDLEFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_IDLEFETCH])))
															
 
																-		{
															
 
																-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_IDLEFETCH], &idle_requests[src_node]);
															
 
																-			idle_requests[src_node] = new_data_requests[STARPU_IDLEFETCH];
															
 
																+			r = _starpu_data_request_list_pop_back(&remain_list);
															
 
																+			if (r->prefetch >= STARPU_IDLEFETCH)
															
 
																+				_starpu_data_request_prio_list_push_front(&idle_requests[handling_node][r->peer_node][r->inout], r);
															
 
																+			else if (r->prefetch > STARPU_FETCH)
															
 
																+				_starpu_data_request_prio_list_push_front(&prefetch_requests[handling_node][r->peer_node][r->inout], r);
															
 
																+			else
															
 
																+				_starpu_data_request_prio_list_push_front(&data_requests[handling_node][r->peer_node][r->inout], r);
															
 
																 		}
															
 
																-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
															
 
																+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 		if (*pushed)
															
@@ -676,32 +783,32 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
																 			 * for eviction to happen.
															
 
																 			 */
															
 
																 			starpu_sleep(0.000001);
															
 
																-			_starpu_wake_all_blocked_workers_on_node(src_node);
															
 
																+			_starpu_wake_all_blocked_workers_on_node(handling_node);
															
 
																 		}
															
 
																 #elif !defined(STARPU_NON_BLOCKING_DRIVERS)
															
 
																-		_starpu_wake_all_blocked_workers_on_node(src_node);
															
 
																+		_starpu_wake_all_blocked_workers_on_node(handling_node);
															
 
																 #endif
															
 
																 	}
															
 
																 	return ret;
															
 
																 }
															
 
																-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
															
 
																+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
															
 
																 {
															
 
																-	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
															
 
																+	return __starpu_handle_node_data_requests(data_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
															
 
																 }
															
 
																-int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
															
 
																+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
															
 
																 {
															
 
																-	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
															
 
																+	return __starpu_handle_node_data_requests(prefetch_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
															
 
																 }
															
 
																-int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
															
 
																+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
															
 
																 {
															
 
																-	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
															
 
																+	return __starpu_handle_node_data_requests(idle_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
															
 
																 }
															
 
																-static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
															
 
																+static int _handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned force)
															
 
																 {
															
 
																 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
															
 
																 //
															
@@ -712,14 +819,14 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 
																 	/* Here helgrind would should that this is an un protected access.
															
 
																 	 * We however don't care about missing an entry, we will get called
															
 
																 	 * again sooner or later. */
															
 
																-	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&data_requests_pending[src_node]))
															
 
																+	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&data_requests_pending[handling_node][peer_node][inout]))
															
 
																 		return 0;
															
 
																 #endif
															
 
																 #ifdef STARPU_NON_BLOCKING_DRIVERS
															
 
																 	if (!force)
															
 
																 	{
															
 
																-		if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[src_node]))
															
 
																+		if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]))
															
 
																 		{
															
 
																 			/* List is busy, do not bother with it */
															
 
																 			return 0;
															
@@ -728,19 +835,19 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 
																 	else
															
 
																 #endif
															
 
																 		/* We really want to handle requests */
															
 
																-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
															
 
																+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
															
 
																-	if (_starpu_data_request_prio_list_empty(&data_requests_pending[src_node]))
															
 
																+	if (_starpu_data_request_prio_list_empty(&data_requests_pending[handling_node][peer_node][inout]))
															
 
																 	{
															
 
																 		/* there is no request */
															
 
																-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
															
 
																+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
															
 
																 		return 0;
															
 
																 	}
															
 
																 	/* for all entries of the list */
															
 
																-	struct _starpu_data_request_prio_list local_list = data_requests_pending[src_node];
															
 
																-	_starpu_data_request_prio_list_init(&data_requests_pending[src_node]);
															
 
																+	struct _starpu_data_request_prio_list local_list = data_requests_pending[handling_node][peer_node][inout];
															
 
																+	_starpu_data_request_prio_list_init(&data_requests_pending[handling_node][peer_node][inout]);
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
															
 
																+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
															
 
																 	_starpu_data_request_prio_list_init(&new_data_requests_pending);
															
 
																 	taken = 0;
															
@@ -803,55 +910,75 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 
																 		}
															
 
																 	}
															
 
																 	_starpu_data_request_prio_list_deinit(&local_list);
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
															
 
																-	data_requests_npending[src_node] -= taken - kept;
															
 
																+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
															
 
																+	data_requests_npending[handling_node][peer_node][inout] -= taken - kept;
															
 
																 	if (kept)
															
 
																-		_starpu_data_request_prio_list_push_prio_list_back(&data_requests_pending[src_node], &new_data_requests_pending);
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
															
 
																+		_starpu_data_request_prio_list_push_prio_list_back(&data_requests_pending[handling_node][peer_node][inout], &new_data_requests_pending);
															
 
																+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
															
 
																 	return taken - kept;
															
 
																 }
															
 
																-int _starpu_handle_pending_node_data_requests(unsigned src_node)
															
 
																+int _starpu_handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout)
															
 
																 {
															
 
																-	return _handle_pending_node_data_requests(src_node, 0);
															
 
																+	return _handle_pending_node_data_requests(handling_node, peer_node, inout, 0);
															
 
																 }
															
 
																-int _starpu_handle_all_pending_node_data_requests(unsigned src_node)
															
 
																+int _starpu_handle_all_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout)
															
 
																 {
															
 
																-	return _handle_pending_node_data_requests(src_node, 1);
															
 
																+	return _handle_pending_node_data_requests(handling_node, peer_node, inout, 1);
															
 
																 }
															
 
																 /* Note: the returned value will be outdated since the locks are not taken at
															
 
																  * entry/exit */
															
 
																-int _starpu_check_that_no_data_request_exists(unsigned node)
															
 
																+static int __starpu_check_that_no_data_request_exists(unsigned node, unsigned peer_node, enum _starpu_data_request_inout inout)
															
 
																 {
															
 
																 	int no_request;
															
 
																 	int no_pending;
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[node]);
															
 
																-	no_request = _starpu_data_request_prio_list_empty(&data_requests[node])
															
 
																-	          && _starpu_data_request_prio_list_empty(&prefetch_requests[node])
															
 
																-		  && _starpu_data_request_prio_list_empty(&idle_requests[node]);
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[node]);
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[node]);
															
 
																-	no_pending = !data_requests_npending[node];
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[node]);
															
 
																+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[node][peer_node][inout]);
															
 
																+	no_request = _starpu_data_request_prio_list_empty(&data_requests[node][peer_node][inout])
															
 
																+	          && _starpu_data_request_prio_list_empty(&prefetch_requests[node][peer_node][inout])
															
 
																+		  && _starpu_data_request_prio_list_empty(&idle_requests[node][peer_node][inout]);
															
 
																+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[node][peer_node][inout]);
															
 
																+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[node][peer_node][inout]);
															
 
																+	no_pending = !data_requests_npending[node][peer_node][inout];
															
 
																+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[node][peer_node][inout]);
															
 
																 	return no_request && no_pending;
															
 
																 }
															
 
																+int _starpu_check_that_no_data_request_exists(unsigned node)
															
 
																+{
															
 
																+	unsigned peer_node, nnodes = starpu_memory_nodes_get_count();
															
 
																+
															
 
																+	for (peer_node = 0; peer_node < nnodes; peer_node++)
															
 
																+		if (!__starpu_check_that_no_data_request_exists(node, peer_node, _STARPU_DATA_REQUEST_IN)
															
 
																+		 || !__starpu_check_that_no_data_request_exists(node, peer_node, _STARPU_DATA_REQUEST_OUT))
															
 
																+		 return 0;
															
 
																+	 return 1;
															
 
																+}
															
 
																+
															
 
																 /* Note: the returned value will be outdated since the locks are not taken at
															
 
																  * entry/exit */
															
 
																-int _starpu_check_that_no_data_request_is_pending(unsigned node)
															
 
																+int _starpu_check_that_no_data_request_is_pending(unsigned node, unsigned peer_node, enum _starpu_data_request_inout inout)
															
 
																 {
															
 
																-	return !data_requests_npending[node];
															
 
																+	return !data_requests_npending[node][peer_node][inout];
															
 
																 }
															
 
																 void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum starpu_is_prefetch prefetch)
															
 
																 {
															
 
																+	_starpu_spin_checklocked(&r->handle->header_lock);
															
 
																 	STARPU_ASSERT(r->prefetch > prefetch);
															
 
																+
															
 
																+	if (prefetch == STARPU_FETCH && !r->added_ref)
															
 
																+	{
															
 
																+		/* That would have been done by _starpu_create_data_request */
															
 
																+		r->added_ref = 1;
															
 
																+		r->dst_replicate->refcnt++;
															
 
																+	}
															
 
																+
															
 
																 	r->prefetch=prefetch;
															
 
																 	if (prefetch >= STARPU_IDLEFETCH)
															
@@ -867,27 +994,27 @@ void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum starpu_
 
																 			_starpu_update_prefetch_status(next_req, prefetch);
															
 
																 	}
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
															
 
																+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node][r->peer_node][r->inout]);
															
 
																 	int found = 1;
															
 
																 	/* The request can be in a different list (handling request or the temp list)
															
 
																 	 * we have to check that it is really in the prefetch or idle list. */
															
 
																-	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
															
 
																-		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node], r);
															
 
																-	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
															
 
																-		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node], r);
															
 
																+	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node][r->peer_node][r->inout], r))
															
 
																+		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node][r->peer_node][r->inout], r);
															
 
																+	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node][r->peer_node][r->inout], r))
															
 
																+		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node][r->peer_node][r->inout], r);
															
 
																 	else
															
 
																 		found = 0;
															
 
																 	if (found)
															
 
																 	{
															
 
																 		if (prefetch > STARPU_FETCH)
															
 
																-			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
															
 
																+			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node][r->peer_node][r->inout],r);
															
 
																 		else
															
 
																-			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
															
 
																+			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node][r->peer_node][r->inout],r);
															
 
																 	}
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[r->handling_node]);
															
 
																+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[r->handling_node][r->peer_node][r->inout]);
															
 
																 #ifndef STARPU_NON_BLOCKING_DRIVERS
															
 
																 	_starpu_wake_all_blocked_workers_on_node(r->handling_node);
															
--- a/src/datawizard/data_request.h
+++ b/src/datawizard/data_request.h
@@ -32,8 +32,8 @@
 
																  * Data interfaces should also have to declare how many asynchronous requests
															
 
																  * they have actually started (think of e.g. csr).
															
 
																  */
															
 
																-#define MAX_PENDING_REQUESTS_PER_NODE 20
															
 
																-#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 10
															
 
																+#define MAX_PENDING_REQUESTS_PER_NODE 5
															
 
																+#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 2
															
 
																 #define MAX_PENDING_IDLE_REQUESTS_PER_NODE 1
															
 
																 /** Maximum time in us that we can afford pushing requests before going back to the driver loop, e.g. for checking GPU task termination */
															
 
																 #define MAX_PUSH_TIME 1000
															
@@ -47,6 +47,11 @@ struct _starpu_callback_list
 
																 	struct _starpu_callback_list *next;
															
 
																 };
															
 
																+enum _starpu_data_request_inout
															
 
																+{
															
 
																+	_STARPU_DATA_REQUEST_IN, _STARPU_DATA_REQUEST_OUT
															
 
																+};
															
 
																+
															
 
																 /** This represents a data request, i.e. we want some data to get transferred
															
 
																  * from a source to a destination. */
															
 
																 LIST_TYPE(_starpu_data_request,
															
@@ -63,6 +68,8 @@ LIST_TYPE(_starpu_data_request,
 
																 	 * the node can make the CUDA/OpenCL calls.
															
 
																 	 */
															
 
																 	unsigned handling_node;
															
 
																+	unsigned peer_node;
															
 
																+	enum _starpu_data_request_inout inout;
															
 
																 	/*
															
 
																 	 * What the destination node wants to do with the data: write to it,
															
@@ -78,10 +85,19 @@ LIST_TYPE(_starpu_data_request,
 
																 	struct _starpu_async_channel async_channel;
															
 
																 	/** Whether the transfer is completed. */
															
 
																-	unsigned completed;
															
 
																+	unsigned completed:1;
															
 
																+
															
 
																+	/** Whether we have already added our reference to the dst replicate. */
															
 
																+	unsigned added_ref:1;
															
 
																+
															
 
																+	/** Whether the request was canceled before being handled (because the transfer already happened another way). */
															
 
																+	unsigned canceled:2;
															
 
																 	/** Whether this is just a prefetch request */
															
 
																-	enum starpu_is_prefetch prefetch;
															
 
																+	enum starpu_is_prefetch prefetch:3;
															
 
																+
															
 
																+	/** Task this request is for */
															
 
																+	struct starpu_task *task;
															
 
																 	/** Number of tasks which used this as a prefetch */
															
 
																 	unsigned nb_tasks_prefetch;
															
@@ -96,6 +112,10 @@ LIST_TYPE(_starpu_data_request,
 
																 	 * dependencies. */
															
 
																 	unsigned ndeps;
															
 
																+	/** Some further tasks may have requested prefetches for the same data
															
 
																+	 * much later on, link with them */
															
 
																+	struct _starpu_data_request *next_same_req;
															
 
																+
															
 
																 	/** in case we have a chain of request (eg. for nvidia multi-GPU), this
															
 
																 	 * is the list of requests which are waiting for this one. */
															
 
																 	struct _starpu_data_request *next_req[STARPU_MAXNODES+1];
															
@@ -123,7 +143,7 @@ LIST_TYPE(_starpu_data_requester,
 
																 	int prio;
															
 
																-	/** if this is more complicated ... (eg. application request) 
															
 
																+	/** if this is more complicated ... (eg. application request)
															
 
																 	 * NB: this callback is not called with the lock taken !
															
 
																 	 */
															
 
																 	void (*ready_data_callback)(void *argcb);
															
@@ -135,15 +155,15 @@ void _starpu_init_data_request_lists(void);
 
																 void _starpu_deinit_data_request_lists(void);
															
 
																 void _starpu_post_data_request(struct _starpu_data_request *r);
															
 
																 /** returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
															
 
																-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
															
 
																-int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
															
 
																-int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
															
 
																+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
															
 
																+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
															
 
																+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
															
 
																-int _starpu_handle_pending_node_data_requests(unsigned src_node);
															
 
																-int _starpu_handle_all_pending_node_data_requests(unsigned src_node);
															
 
																+int _starpu_handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
															
 
																+int _starpu_handle_all_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
															
 
																-int _starpu_check_that_no_data_request_exists(unsigned node);
															
 
																-int _starpu_check_that_no_data_request_is_pending(unsigned node);
															
 
																+int _starpu_check_that_no_data_request_exists(unsigned handling_node);
															
 
																+int _starpu_check_that_no_data_request_is_pending(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
															
 
																 struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t handle,
															
 
																 							 struct _starpu_data_replicate *src_replicate,
															
@@ -151,12 +171,13 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
																 							 int handling_node,
															
 
																 							 enum starpu_data_access_mode mode,
															
 
																 							 unsigned ndeps,
															
 
																+							 struct starpu_task *task,
															
 
																 							 enum starpu_is_prefetch is_prefetch,
															
 
																 							 int prio,
															
 
																 							 unsigned is_write_invalidation,
															
 
																 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
															
 
																-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc);
															
 
																+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc);
															
 
																 void _starpu_data_request_append_callback(struct _starpu_data_request *r,
															
 
																 					  void (*callback_func)(void *),
															
--- a/src/datawizard/datawizard.c
+++ b/src/datawizard/datawizard.c
@@ -26,19 +26,17 @@
 
																 #include <core/simgrid.h>
															
 
																 #endif
															
 
																-int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests)
															
 
																+static int ____starpu_datawizard_progress(unsigned memory_node, unsigned peer_start, unsigned peer_end, enum  _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned push_requests)
															
 
																 {
															
 
																 	int ret = 0;
															
 
																-
															
 
																-#ifdef STARPU_SIMGRID
															
 
																-	/* XXX */
															
 
																-	starpu_sleep(0.000001);
															
 
																-#endif
															
 
																-	STARPU_UYIELD();
															
 
																+	unsigned peer_node;
															
 
																 	/* in case some other driver requested data */
															
 
																-	if (_starpu_handle_pending_node_data_requests(memory_node))
															
 
																-		ret = 1;
															
 
																+	for (peer_node = peer_start; peer_node < peer_end; peer_node++)
															
 
																+	{
															
 
																+		if (_starpu_handle_pending_node_data_requests(memory_node, peer_node, inout))
															
 
																+			ret = 1;
															
 
																+	}
															
 
																 	starpu_memchunk_tidy(memory_node);
															
@@ -46,26 +44,70 @@ int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsi
 
																 	{
															
 
																 		/* Some transfers have finished, or the driver requests to really push more */
															
 
																 		unsigned pushed;
															
 
																-		if (_starpu_handle_node_data_requests(memory_node, may_alloc, &pushed) == 0)
															
 
																+		unsigned ok = 1;
															
 
																+
															
 
																+		for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
															
 
																 		{
															
 
																+			if (_starpu_handle_node_data_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
															
 
																+				ok = 0;
															
 
																 			if (pushed)
															
 
																 				ret = 1;
															
 
																+		}
															
 
																+
															
 
																+		if (ok)
															
 
																+		{
															
 
																+			unsigned doidle = 1;
															
 
																+
															
 
																 			/* We pushed all pending requests, we can afford pushing
															
 
																 			 * prefetch requests */
															
 
																-			_starpu_handle_node_prefetch_requests(memory_node, may_alloc, &pushed);
															
 
																-			if (_starpu_check_that_no_data_request_is_pending(memory_node))
															
 
																+			for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
															
 
																+			{
															
 
																+				if (_starpu_handle_node_prefetch_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
															
 
																+					ok = 0;
															
 
																+				if (pushed)
															
 
																+					ret = 1;
															
 
																+				if (!_starpu_check_that_no_data_request_is_pending(memory_node, peer_node, inout))
															
 
																+					doidle = 0;
															
 
																+			}
															
 
																+
															
 
																+			if (doidle)
															
 
																 				/* No pending transfer, push some idle transfer */
															
 
																-				_starpu_handle_node_idle_requests(memory_node, may_alloc, &pushed);
															
 
																+				for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
															
 
																+				{
															
 
																+					if (_starpu_handle_node_idle_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
															
 
																+						ok = 0;
															
 
																+					if (pushed)
															
 
																+						ret = 1;
															
 
																+				}
															
 
																 		}
															
 
																-		if (pushed)
															
 
																-			ret = 1;
															
 
																+
															
 
																 	}
															
 
																-	_starpu_execute_registered_progression_hooks();
															
 
																 	return ret;
															
 
																 }
															
 
																-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
															
 
																+static int ___starpu_datawizard_progress(unsigned memory_node, unsigned nnodes, enum _starpu_may_alloc may_alloc, unsigned push_requests)
															
 
																+{
															
 
																+	int ret = 0;
															
 
																+	unsigned peer_node;
															
 
																+
															
 
																+#ifdef STARPU_SIMGRID
															
 
																+	/* XXX */
															
 
																+	starpu_sleep(0.000001);
															
 
																+#endif
															
 
																+	STARPU_UYIELD();
															
 
																+
															
 
																+	/* First handle all incoming transfers */
															
 
																+	ret |= ____starpu_datawizard_progress(memory_node, 0, nnodes, _STARPU_DATA_REQUEST_IN, may_alloc, push_requests);
															
 
																+
															
 
																+	/* Then handle outgoing transfers */
															
 
																+	for (peer_node = 0; peer_node < nnodes; peer_node++)
															
 
																+		ret |= ____starpu_datawizard_progress(memory_node, peer_node, peer_node+1, _STARPU_DATA_REQUEST_OUT, may_alloc, push_requests);
															
 
																+
															
 
																+	return ret;
															
 
																+}
															
 
																+
															
 
																+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests)
															
 
																 {
															
 
																 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
															
 
																         unsigned memnode;
															
@@ -77,7 +119,8 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 
																 		int nnumas = starpu_memory_nodes_get_numa_count();
															
 
																 		int numa;
															
 
																 		for (numa = 0; numa < nnumas; numa++)
															
 
																-			ret |=  ___starpu_datawizard_progress(numa, may_alloc, push_requests);
															
 
																+			ret |=  ___starpu_datawizard_progress(numa, nnumas, may_alloc, push_requests);
															
 
																+		_starpu_execute_registered_progression_hooks();
															
 
																 		return ret;
															
 
																 	}
															
@@ -87,19 +130,38 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 
																 		worker = &worker->set->workers[0];
															
 
																 	unsigned current_worker_id = worker->workerid;
															
 
																-        int ret = 0;
															
 
																+	int ret = 0;
															
 
																 	unsigned nnodes = starpu_memory_nodes_get_count();
															
 
																-        for (memnode = 0; memnode < nnodes; memnode++)
															
 
																-        {
															
 
																-                if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
															
 
																-                        ret |= ___starpu_datawizard_progress(memnode, may_alloc, push_requests);
															
 
																-        }
															
 
																+	for (memnode = 0; memnode < nnodes; memnode++)
															
 
																+	{
															
 
																+		if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
															
 
																+		{
															
 
																+			if(_starpu_config.conf.cuda_only_fast_alloc_other_memnodes && worker->arch == STARPU_CUDA_WORKER && worker->memory_node != memnode)
															
 
																+				ret |=  ___starpu_datawizard_progress(memnode, nnodes, STARPU_DATAWIZARD_ONLY_FAST_ALLOC, push_requests);
															
 
																+			else
															
 
																+				ret |=  ___starpu_datawizard_progress(memnode, nnodes, may_alloc, push_requests);
															
 
																+			}
															
 
																+	}
															
 
																+
															
 
																+	_starpu_execute_registered_progression_hooks();
															
 
																         return ret;
															
 
																 }
															
 
																-void _starpu_datawizard_progress(unsigned may_alloc)
															
 
																+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc)
															
 
																 {
															
 
																         __starpu_datawizard_progress(may_alloc, 1);
															
 
																 }
															
 
																+
															
 
																+void _starpu_datawizard_handle_all_pending_node_data_requests(unsigned memnode)
															
 
																+{
															
 
																+	unsigned nnodes = starpu_memory_nodes_get_count();
															
 
																+	unsigned memnode2;
															
 
																+
															
 
																+	for (memnode2 = 0; memnode2 < nnodes; memnode2++)
															
 
																+	{
															
 
																+		_starpu_handle_all_pending_node_data_requests(memnode, memnode2, _STARPU_DATA_REQUEST_IN);
															
 
																+		_starpu_handle_all_pending_node_data_requests(memnode, memnode2, _STARPU_DATA_REQUEST_OUT);
															
 
																+	}
															
 
																+}
															
--- a/src/datawizard/datawizard.h
+++ b/src/datawizard/datawizard.h
@@ -34,18 +34,19 @@
 
																 #include <core/dependencies/implicit_data_deps.h>
															
 
																-/** Make data transfers progress on node \p memory_node.
															
 
																+
															
 
																+/** Make data transfers progress on all memory nodes driven by the current worker.
															
 
																  *
															
 
																  * If \p push_requests is 1, it can start new transfers
															
 
																  *
															
 
																- * If \p may_alloc is 1, it can allocate destination data for transfers
															
 
																+ * If \p may_alloc is STARPU_DATAWIZARD_DO_ALLOC, it can allocate destination data for transfers
															
 
																  * (this is not possible e.g. when spinning for a handle lock)
															
 
																  */
															
 
																-int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
															
 
																-/** Call ___starpu_datawizard_progress() for all memory nodes driven by the
															
 
																- * current worker */
															
 
																-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests);
															
 
																+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests);
															
 
																 /** Call __starpu_datawizard_progress with push_requests = 1 */
															
 
																-void _starpu_datawizard_progress(unsigned may_alloc);
															
 
																+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc);
															
 
																+
															
 
																+/** Check for all pending data request progress on node \p memory_node */
															
 
																+void _starpu_datawizard_handle_all_pending_node_data_requests(unsigned memnode);
															
 
																 #endif // __DATAWIZARD_H__
															
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -193,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 
																 		int home_node = initial_handle->home_node;
															
 
																 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
															
 
																 			home_node = STARPU_MAIN_RAM;
															
 
																-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
															
 
																+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH, 0);
															
 
																 #ifdef STARPU_DEVEL
															
 
																 #warning we should reclaim memory if allocation failed
															
 
																 #endif
															
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -375,13 +375,14 @@ _starpu_data_initialize_per_worker(starpu_data_handle_t handle)
 
																 		replicate->state = STARPU_INVALID;
															
 
																 		//replicate->refcnt = 0;
															
 
																 		replicate->handle = handle;
															
 
																-		//replicate->requested = 0;
															
 
																 		//replicate->nb_tasks_prefetch = 0;
															
 
																 		//for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																 		//{
															
 
																 		//	replicate->request[node] = NULL;
															
 
																+		//	replicate->last_request[node] = NULL;
															
 
																 		//}
															
 
																+		//replicate->load_request = NULL;
															
 
																 		/* Assuming being used for SCRATCH for now, patched when entering REDUX mode */
															
 
																 		replicate->relaxed_coherency = 1;
															
@@ -785,7 +786,7 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 
																 	}
															
 
																 	if (valid)
															
 
																 	{
															
 
																-		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
															
 
																+		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, origin);
															
 
																 		STARPU_ASSERT(!ret);
															
 
																 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
															
 
																 	}
															
@@ -1033,6 +1034,7 @@ retry_busy:
 
																 	for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																 	{
															
 
																 		struct _starpu_data_replicate *local = &handle->per_node[node];
															
 
																+		STARPU_ASSERT(!local->refcnt);
															
 
																 		if (local->allocated)
															
 
																 		{
															
 
																 			_starpu_data_unregister_ram_pointer(handle, node);
															
@@ -1049,6 +1051,7 @@ retry_busy:
 
																 		for (worker = 0; worker < nworkers; worker++)
															
 
																 		{
															
 
																 			struct _starpu_data_replicate *local = &handle->per_worker[worker];
															
 
																+			STARPU_ASSERT(!local->refcnt);
															
 
																 			/* free the data copy in a lazy fashion */
															
 
																 			if (local->allocated && local->automatically_allocated)
															
 
																 				_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
															
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -149,6 +149,15 @@ static int _starpu_malloc_should_pin(int flags)
 
																 	return 0;
															
 
																 }
															
 
																+int _starpu_malloc_willpin_on_node(unsigned dst_node)
															
 
																+{
															
 
																+	int flags = malloc_on_node_default_flags[dst_node];
															
 
																+	return (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0
															
 
																+			&& (_starpu_can_submit_cuda_task()
															
 
																+			    /* || _starpu_can_submit_opencl_task() */
															
 
																+			));
															
 
																+}
															
 
																+
															
 
																 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
															
 
																 {
															
 
																 	int ret=0;
															
@@ -185,6 +194,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 
																 		goto end;
															
 
																 	}
															
 
																+	/* Note: synchronize this test with _starpu_malloc_willpin_on_node */
															
 
																 	if (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0)
															
 
																 	{
															
 
																 		if (_starpu_can_submit_cuda_task())
															
--- a/src/datawizard/malloc.h
+++ b/src/datawizard/malloc.h
@@ -26,4 +26,11 @@ void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
 
																 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
															
 
																 int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags);
															
 
																+
															
 
																+/**
															
 
																+   Returns whether when allocating data on \p dst_node, we will do pinning, i.e.
															
 
																+   the allocation will be very expensive, and should thus be moved out from the
															
 
																+   critical path
															
 
																+  */
															
 
																+int _starpu_malloc_willpin_on_node(unsigned dst_node);
															
 
																 #endif
															
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -169,7 +169,10 @@ void _starpu_mem_chunk_disk_register(unsigned disk_memnode)
 
																 	{
															
 
																 		enum starpu_node_kind kind = starpu_node_get_kind(i);
															
 
																 		if (kind == STARPU_CPU_RAM)
															
 
																+		{
															
 
																+			STARPU_HG_DISABLE_CHECKING(evictable[i]);
															
 
																 			evictable[i] = 1;
															
 
																+		}
															
 
																 	}
															
 
																 }
															
@@ -327,7 +330,7 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 
																 		{
															
 
																 			/* This is the only copy, push it to destination */
															
 
																 			struct _starpu_data_request *r;
															
 
																-			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
															
 
																+			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
															
 
																 			/* There is no way we don't need a request, since
															
 
																 			 * source is OWNER, destination can't be having it */
															
 
																 			STARPU_ASSERT(r);
															
@@ -552,8 +555,9 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
																 int starpu_data_can_evict(starpu_data_handle_t handle, unsigned node, enum starpu_is_prefetch is_prefetch)
															
 
																 {
															
 
																+	STARPU_ASSERT(node < STARPU_MAXNODES);
															
 
																 	/* This data should be written through to this node, avoid dropping it! */
															
 
																-	if (handle->wt_mask & (1<<node))
															
 
																+	if (node < sizeof(handle->wt_mask) * 8 && handle->wt_mask & (1<<node))
															
 
																 		return 0;
															
 
																 	/* This data was registered from this node, we will not be able to drop it anyway */
															
@@ -1012,7 +1016,7 @@ restart2:
 
																 				next_mc->remove_notify = &next_mc;
															
 
																 			}
															
 
																 			/* Note: this may unlock mc_list! */
															
 
																-			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, STARPU_FETCH);
															
 
																+			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, is_prefetch);
															
 
																 			if (orig_next_mc)
															
 
																 			{
															
@@ -1179,7 +1183,7 @@ void starpu_memchunk_tidy(unsigned node)
 
																 			if (
															
 
																 				/* This data should be written through to this node, avoid
															
 
																 				 * dropping it! */
															
 
																-				handle->wt_mask & (1<<node)
															
 
																+				(node < sizeof(handle->wt_mask) * 8 && handle->wt_mask & (1<<node))
															
 
																 				/* This is partitioned, don't care about the
															
 
																 				 * whole data, we'll work on the subdatas.  */
															
 
																 			     || handle->nchildren
															
@@ -1231,7 +1235,7 @@ void starpu_memchunk_tidy(unsigned node)
 
																 			}
															
 
																 			_starpu_spin_unlock(&mc_lock[node]);
															
 
																-			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
															
 
																+			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, NULL, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
															
 
																 			{
															
 
																 				/* No request was actually needed??
															
 
																 				 * Odd, but cope with it.  */
															
@@ -1442,7 +1446,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 
																  *
															
 
																  */
															
 
																-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch)
															
 
																+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
															
 
																 {
															
 
																 	unsigned attempts = 0;
															
 
																 	starpu_ssize_t allocated_memory;
															
@@ -1473,6 +1477,12 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 
																 	if (!prefetch_oom)
															
 
																 		_STARPU_TRACE_END_ALLOC_REUSE(dst_node, handle, 0);
															
 
																 #endif
															
 
																+
															
 
																+	/* If this is RAM and pinned this will be slow
															
 
																+	   In case we only want fast allocations return here */
															
 
																+	if(only_fast_alloc && starpu_node_get_kind(dst_node) == STARPU_CPU_RAM && _starpu_malloc_willpin_on_node(dst_node))
															
 
																+		return -ENOMEM;
															
 
																+
															
 
																 	STARPU_ASSERT(handle->ops);
															
 
																 	STARPU_ASSERT(handle->ops->allocate_data_on_node);
															
 
																 	STARPU_ASSERT(replicate->data_interface);
															
@@ -1576,7 +1586,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 
																 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
															
 
																 	{
															
 
																 		cpt++;
															
 
																-		_starpu_datawizard_progress(0);
															
 
																+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
															
 
																 	}
															
 
																 	if (cpt == STARPU_SPIN_MAXTRY)
															
 
																 		_starpu_spin_lock(&handle->header_lock);
															
@@ -1620,7 +1630,7 @@ out:
 
																 	return allocated_memory;
															
 
																 }
															
 
																-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch)
															
 
																+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
															
 
																 {
															
 
																 	starpu_ssize_t allocated_memory;
															
@@ -1635,7 +1645,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 
																 		return 0;
															
 
																 	STARPU_ASSERT(replicate->data_interface);
															
 
																-	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch);
															
 
																+	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch, only_fast_alloc);
															
 
																 	/* perhaps we could really not handle that capacity misses */
															
 
																 	if (allocated_memory == -ENOMEM)
															
@@ -1845,7 +1855,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 
																 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
															
 
																 			for (i=0; i<nb_numa_nodes; i++)
															
 
																 			{
															
 
																-				if (handle->per_node[i].allocated || 
															
 
																+				if (handle->per_node[i].allocated ||
															
 
																 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
															
 
																 				{
															
 
																 					target = i;
															
@@ -1877,7 +1887,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 
																 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
															
 
																 			for (i=0; i<nb_numa_nodes; i++)
															
 
																 			{
															
 
																-				if (handle->per_node[i].allocated || 
															
 
																+				if (handle->per_node[i].allocated ||
															
 
																 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
															
 
																 				{
															
 
																 					target = i;
															
--- a/src/datawizard/memalloc.h
+++ b/src/datawizard/memalloc.h
@@ -83,7 +83,7 @@ void _starpu_init_mem_chunk_lists(void);
 
																 void _starpu_deinit_mem_chunk_lists(void);
															
 
																 void _starpu_mem_chunk_init_last(void);
															
 
																 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
															
 
																-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch);
															
 
																+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc);
															
 
																 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
															
 
																 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
															
 
																 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);
															
--- a/src/datawizard/memory_nodes.c
+++ b/src/datawizard/memory_nodes.c
@@ -151,6 +151,7 @@ void _starpu_memory_node_register_condition(struct _starpu_worker *worker, starp
 
																 #undef starpu_worker_get_memory_node
															
 
																 unsigned starpu_worker_get_memory_node(unsigned workerid)
															
 
																 {
															
 
																+	(void) workerid;
															
 
																 	return _starpu_worker_get_memory_node(workerid);
															
 
																 }
															
@@ -167,12 +168,10 @@ void _starpu_worker_drives_memory_node(struct _starpu_worker *worker, unsigned m
 
																 	}
															
 
																 }
															
 
																+#undef starpu_worker_get_local_memory_node
															
 
																 unsigned starpu_worker_get_local_memory_node(void)
															
 
																 {
															
 
																-	struct _starpu_worker *worker = _starpu_get_local_worker_key();
															
 
																-	if (!worker)
															
 
																-		return STARPU_MAIN_RAM;
															
 
																-	return worker->memory_node;
															
 
																+	return _starpu_worker_get_local_memory_node();
															
 
																 }
															
 
																 int starpu_memory_node_get_devid(unsigned node)
															
--- a/src/datawizard/memory_nodes.h
+++ b/src/datawizard/memory_nodes.h
@@ -117,12 +117,19 @@ static inline enum starpu_node_kind _starpu_node_get_kind(unsigned node)
 
																 }
															
 
																 #define starpu_node_get_kind _starpu_node_get_kind
															
 
																+#if STARPU_MAXNODES == 1
															
 
																+#define _starpu_memory_nodes_get_count() 1
															
 
																+#else
															
 
																 static inline unsigned _starpu_memory_nodes_get_count(void)
															
 
																 {
															
 
																 	return _starpu_descr.nnodes;
															
 
																 }
															
 
																+#endif
															
 
																 #define starpu_memory_nodes_get_count _starpu_memory_nodes_get_count
															
 
																+#if STARPU_MAXNODES == 1
															
 
																+#define _starpu_worker_get_memory_node(workerid) 0
															
 
																+#else
															
 
																 static inline unsigned _starpu_worker_get_memory_node(unsigned workerid)
															
 
																 {
															
 
																 	struct _starpu_machine_config *config = _starpu_get_machine_config();
															
@@ -139,6 +146,20 @@ static inline unsigned _starpu_worker_get_memory_node(unsigned workerid)
 
																 	return config->combined_workers[workerid - nworkers].memory_node;
															
 
																 }
															
 
																+#endif
															
 
																 #define starpu_worker_get_memory_node _starpu_worker_get_memory_node
															
 
																+#if STARPU_MAXNODES == 1
															
 
																+#define _starpu_worker_get_local_memory_node() 0
															
 
																+#else
															
 
																+static inline unsigned _starpu_worker_get_local_memory_node(void)
															
 
																+{
															
 
																+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
															
 
																+	if (!worker)
															
 
																+		return STARPU_MAIN_RAM;
															
 
																+	return worker->memory_node;
															
 
																+}
															
 
																+#endif
															
 
																+#define starpu_worker_get_local_memory_node _starpu_worker_get_local_memory_node
															
 
																+
															
 
																 #endif // __MEMORY_NODES_H__
															
--- a/src/datawizard/reduction.c
+++ b/src/datawizard/reduction.c
@@ -280,12 +280,21 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 
																 					redux_task->cl = handle->redux_cl;
															
 
																 					STARPU_ASSERT(redux_task->cl);
															
 
																 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0)))
															
 
																-						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW, 0);
															
 
																+						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW|STARPU_COMMUTE, 0);
															
 
																 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 1)))
															
 
																 						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_R, 1);
															
 
																-					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
															
 
																+					STARPU_ASSERT_MSG((STARPU_CODELET_GET_MODE(redux_task->cl, 0) & ~STARPU_COMMUTE) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
															
 
																 					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet %p has to be R", redux_task->cl);
															
 
																+					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0) & STARPU_COMMUTE))
															
 
																+					{
															
 
																+						static int warned;
															
 
																+						if (!warned)
															
 
																+						{
															
 
																+							warned = 1;
															
 
																+							_STARPU_DISP("Warning: for reductions, codelet %p should have STARPU_COMMUTE along STARPU_RW\n", redux_task->cl);
															
 
																+						}
															
 
																+					}
															
 
																 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i], 0);
															
 
																 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i+step], 1);
															
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -53,7 +53,7 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 
																 	_starpu_spin_lock(&handle->header_lock);
															
 
																-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
															
 
																+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, NULL, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
															
 
																 	/* we do not increase the refcnt associated to the request since we are
															
 
																 	 * not waiting for its termination */
															
@@ -126,7 +126,7 @@ static inline void _starpu_data_acquire_launch_fetch(struct user_interaction_wra
 
																 	starpu_data_handle_t handle = wrapper->handle;
															
 
																 	struct _starpu_data_replicate *replicate = node >= 0 ? &handle->per_node[node] : NULL;
															
 
																-	int ret = _starpu_fetch_data_on_node(handle, node, replicate, wrapper->mode, wrapper->detached, wrapper->prefetch, async, callback, callback_arg, wrapper->prio, "_starpu_data_acquire_launch_fetch");
															
 
																+	int ret = _starpu_fetch_data_on_node(handle, node, replicate, wrapper->mode, wrapper->detached, NULL, wrapper->prefetch, async, callback, callback_arg, wrapper->prio, "_starpu_data_acquire_launch_fetch");
															
 
																 	STARPU_ASSERT(!ret);
															
 
																 }
															
@@ -191,7 +191,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 
																 							  void (*callback)(void *arg),
															
 
																 							  void *arg,
															
 
																 							  int sequential_consistency, int quick,
															
 
																-							  long *pre_sync_jobid, long *post_sync_jobid)
															
 
																+							  long *pre_sync_jobid, long *post_sync_jobid, int prio)
															
 
																 {
															
 
																 	STARPU_ASSERT(handle);
															
 
																 	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data (%p) is not possible", handle);
															
@@ -211,6 +211,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 
																 	wrapper->callback_arg = arg;
															
 
																 	wrapper->pre_sync_task = NULL;
															
 
																 	wrapper->post_sync_task = NULL;
															
 
																+	wrapper->prio = prio;
															
 
																 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
															
 
																 	int handle_sequential_consistency = handle->sequential_consistency;
															
@@ -225,6 +226,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 
																 		wrapper->pre_sync_task->callback_func = starpu_data_acquire_cb_pre_sync_callback;
															
 
																 		wrapper->pre_sync_task->callback_arg = wrapper;
															
 
																 		wrapper->pre_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
															
 
																+		wrapper->pre_sync_task->priority = prio;
															
 
																 		pre_sync_job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
															
 
																 		if (pre_sync_jobid)
															
 
																 			*pre_sync_jobid = pre_sync_job->job_id;
															
@@ -233,6 +235,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 
																 		wrapper->post_sync_task->name = "_starpu_data_acquire_cb_release";
															
 
																 		wrapper->post_sync_task->detach = 1;
															
 
																 		wrapper->post_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
															
 
																+		wrapper->post_sync_task->priority = prio;
															
 
																 		post_sync_job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
															
 
																 		if (post_sync_jobid)
															
 
																 			*post_sync_jobid = post_sync_job->job_id;
															
@@ -280,7 +283,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_quick(starpu_data_hand
 
																 							  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg,
															
 
																 							  int sequential_consistency, int quick)
															
 
																 {
															
 
																-	return starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(handle, node, mode, NULL, callback, arg, sequential_consistency, quick, NULL, NULL);
															
 
																+	return starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(handle, node, mode, NULL, callback, arg, sequential_consistency, quick, NULL, NULL, STARPU_DEFAULT_PRIO);
															
 
																 }
															
 
																 int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node,
															
@@ -616,7 +619,7 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
																 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
															
 
																 {
															
 
																-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, 0);
															
 
																+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, STARPU_DEFAULT_PRIO);
															
 
																 }
															
 
																 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
															
@@ -626,7 +629,7 @@ int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node
 
																 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
															
 
																 {
															
 
																-	return starpu_data_prefetch_on_node_prio(handle, node, async, 0);
															
 
																+	return starpu_data_prefetch_on_node_prio(handle, node, async, STARPU_DEFAULT_PRIO);
															
 
																 }
															
 
																 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
															
@@ -636,7 +639,7 @@ int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned
 
																 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
															
 
																 {
															
 
																-	return starpu_data_idle_prefetch_on_node_prio(handle, node, async, 0);
															
 
																+	return starpu_data_idle_prefetch_on_node_prio(handle, node, async, STARPU_DEFAULT_PRIO);
															
 
																 }
															
 
																 static void _starpu_data_wont_use(void *data)
															
@@ -817,7 +820,7 @@ void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int
 
																 		unsigned node;
															
 
																 		for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																 		{
															
 
																-			if (handle->per_node[memory_node].requested & (1UL << node))
															
 
																+			if (handle->per_node[memory_node].request[node])
															
 
																 			{
															
 
																 				requested = 1;
															
 
																 				break;
															
--- a/src/datawizard/write_back.c
+++ b/src/datawizard/write_back.c
@@ -50,7 +50,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
																 				while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
															
 
																 				{
															
 
																 					cpt++;
															
 
																-					__starpu_datawizard_progress(1, 1);
															
 
																+					__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
															
 
																 				}
															
 
																 				if (cpt == STARPU_SPIN_MAXTRY)
															
 
																 					_starpu_spin_lock(&handle->header_lock);
															
@@ -64,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
																 				struct _starpu_data_request *r;
															
 
																 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
															
 
																-									 STARPU_R, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
															
 
																+									 STARPU_R, NULL, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
															
 
																 			        /* If no request was created, the handle was already up-to-date on the
															
 
																 			         * node */
															
--- a/src/debug/latency.c
+++ b/src/debug/latency.c
@@ -34,7 +34,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 
																 		_starpu_spin_unlock(&handle->header_lock);
															
 
																 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
															
 
																-		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
															
 
																+		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
															
 
																 		STARPU_ASSERT(!ret);
															
 
																 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
															
@@ -44,7 +44,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 
																 		_starpu_spin_unlock(&handle->header_lock);
															
 
																 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
															
 
																-		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
															
 
																+		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
															
 
																 		STARPU_ASSERT(!ret);
															
 
																 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
															
 
																 	}
															
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -251,11 +251,12 @@ static void task_dump(struct task_info *task, struct starpu_fxt_options *options
 
																 		fprintf(tasks_file, "\n");
															
 
																 		fprintf(tasks_file, "Modes:");
															
 
																 		for (i = 0; i < task->ndata; i++)
															
 
																-			fprintf(tasks_file, " %s%s%s%s%s",
															
 
																+			fprintf(tasks_file, " %s%s%s%s%s%s",
															
 
																 				(task->data[i].mode & STARPU_R)?"R":"",
															
 
																 				(task->data[i].mode & STARPU_W)?"W":"",
															
 
																 				(task->data[i].mode & STARPU_SCRATCH)?"S":"",
															
 
																 				(task->data[i].mode & STARPU_REDUX)?"X":"",
															
 
																+				(task->data[i].mode & STARPU_MPI_REDUX)?"X-mpi":"",
															
 
																 				(task->data[i].mode & STARPU_COMMUTE)?"C":"");
															
 
																 		fprintf(tasks_file, "\n");
															
 
																 		fprintf(tasks_file, "Sizes:");
															
@@ -763,15 +764,20 @@ static void memnode_pop_state(double time, const char *prefix, unsigned int memn
 
																 #endif
															
 
																 }
															
 
																-static void memnode_event(double time, const char *prefix, unsigned int memnodeid, const char *name, unsigned long handle, unsigned long info, unsigned long size, unsigned int dest, struct starpu_fxt_options *options)
															
 
																+static void memnode_event(double time, const char *prefix, unsigned int memnodeid, const char *name, unsigned long handle, unsigned long value, unsigned long info, long size_prio, unsigned int dest, struct starpu_fxt_options *options)
															
 
																 {
															
 
																 	if (!options->memory_states)
															
 
																 		return;
															
 
																+	// If there is not a valid memory node, we cant associate it
															
 
																+	if((int)memnodeid < 0)
															
 
																+		return;
															
 
																 #ifdef STARPU_HAVE_POTI
															
 
																 	char container[STARPU_POTI_STR_LEN];
															
 
																 	char p_handle[STARPU_POTI_STR_LEN];
															
 
																+	char p_value[STARPU_POTI_STR_LEN];
															
 
																 	memmanager_container_alias(container, STARPU_POTI_STR_LEN, prefix, memnodeid);
															
 
																 	snprintf(p_handle, sizeof(p_handle), "%lx", handle);
															
 
																+	snprintf(p_value, sizeof(p_value), "%lx", value);
															
 
																 #ifdef HAVE_POTI_USER_NEWEVENT
															
 
																 	char p_dest[STARPU_POTI_STR_LEN];
															
@@ -780,15 +786,15 @@ static void memnode_event(double time, const char *prefix, unsigned int memnodei
 
																 	memmanager_container_alias(p_dest, STARPU_POTI_STR_LEN, prefix, dest);
															
 
																 	snprintf(p_info, sizeof(p_info), "%lu", info);
															
 
																-	snprintf(p_size, sizeof(p_size), "%lu", size);
															
 
																+	snprintf(p_size, sizeof(p_size), "%ld", size_prio);
															
 
																-	poti_user_NewEvent(_starpu_poti_MemoryEvent, time, container, name, "0", 4,
															
 
																+	poti_user_NewEvent(_starpu_poti_MemoryEvent, time, container, name, p_value, 4,
															
 
																 			   p_handle, p_info, p_size, p_dest);
															
 
																 #else
															
 
																 	poti_NewEvent(time, container, name, p_handle);
															
 
																 #endif
															
 
																 #else
															
 
																-	fprintf(out_paje_file, "22    %.9f    %s %smm%u  0 %lx %lu %lu %smm%u\n", time, name, prefix, memnodeid, handle, info, size, prefix, dest);
															
 
																+	fprintf(out_paje_file, "22    %.9f    %s %smm%u  %lx %lx %lu %ld %smm%u\n", time, name, prefix, memnodeid, value, handle, info, size_prio, prefix, dest);
															
 
																 #endif
															
 
																 }
															
@@ -2232,7 +2238,7 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
																 		{
															
 
																 			double time = get_event_time_stamp(ev, options);
															
 
																 			memnode_push_state(time, prefix, dst, "Co");
															
 
																-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCo", handle, comid, size, src, options);
															
 
																+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCo", handle, 0, comid, size, src, options);
															
 
																 #ifdef STARPU_HAVE_POTI
															
 
																 			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN], src_memnode_container[STARPU_POTI_STR_LEN];
															
 
																 			char program_container[STARPU_POTI_STR_LEN];
															
@@ -2351,7 +2357,7 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
																 		{
															
 
																 			double time = get_event_time_stamp(ev, options);
															
 
																 			memnode_pop_state(time, prefix, dst);
															
 
																-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoE", handle, comid, size, src, options);
															
 
																+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoE", handle, 0, comid, size, src, options);
															
 
																 #ifdef STARPU_HAVE_POTI
															
 
																 			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
															
 
																 			char dst_memnode_container[STARPU_POTI_STR_LEN], program_container[STARPU_POTI_STR_LEN];
															
@@ -2378,7 +2384,7 @@ static void handle_start_driver_copy_async(struct fxt_ev_64 *ev, struct starpu_f
 
																 		if (out_paje_file)
															
 
																 		{
															
 
																 			memnode_push_state(get_event_time_stamp(ev, options), prefix, dst, "CoA");
															
 
																-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoA", 0, 0, 0, src, options);
															
 
																+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoA", 0, 0, 0, 0, src, options);
															
 
																 		}
															
 
																 }
															
@@ -2394,7 +2400,7 @@ static void handle_end_driver_copy_async(struct fxt_ev_64 *ev, struct starpu_fxt
 
																 		if (out_paje_file)
															
 
																 		{
															
 
																 			memnode_pop_state(get_event_time_stamp(ev, options), prefix, dst);
															
 
																-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoAE", 0, 0, 0, src, options);
															
 
																+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoAE", 0, 0, 0, 0, src, options);
															
 
																 		}
															
 
																 }
															
@@ -2408,32 +2414,36 @@ static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 
																 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
															
 
																 }
															
 
																+static void handle_data_request(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
															
 
																+{
															
 
																+	unsigned memnode = ev->param[0];
															
 
																+	unsigned dest = ev->param[1];
															
 
																+	unsigned prio = ev->param[2];
															
 
																+	unsigned long handle = ev->param[3];
															
 
																+	unsigned prefe = ev->param[4];
															
 
																+	unsigned long request = ev->param[5];
															
 
																+
															
 
																+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, request, prefe, prio, dest, options);
															
 
																+}
															
 
																+
															
 
																 static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
															
 
																 {
															
 
																 	unsigned memnode = ev->param[0];
															
 
																 	unsigned size = ev->param[2];
															
 
																 	unsigned long handle = ev->param[3];
															
 
																-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, size, memnode, options);
															
 
																+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, size, memnode, options);
															
 
																 }
															
 
																 static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
															
 
																 {
															
 
																 	unsigned memnode = ev->param[0];
															
 
																-	unsigned dest = ev->param[1];
															
 
																-	if(strcmp(eventstr, "rc")==0)
															
 
																-	{
															
 
																-		//If it is a Request Create, use dest normally
															
 
																-	}
															
 
																-	else
															
 
																-	{
															
 
																-		dest = memnode;
															
 
																-	}
															
 
																+	//unsigned dest = ev->param[1]; // Not used
															
 
																 	unsigned size = ev->param[2];
															
 
																 	unsigned long handle = ev->param[3];
															
 
																 	unsigned prefe = ev->param[4];
															
 
																-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
															
 
																+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, prefe, size, memnode, options);
															
 
																 }
															
 
																 static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
															
@@ -2442,7 +2452,7 @@ static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_o
 
																 	unsigned long handle = ev->param[2];
															
 
																 	unsigned info = ev->param[3];
															
 
																-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, info, 0, memnode, options);
															
 
																+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, info, 0, memnode, options);
															
 
																 }
															
 
																 static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
															
@@ -2450,7 +2460,7 @@ static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt
 
																 	unsigned memnode = ev->param[0];
															
 
																 	unsigned long handle = ev->param[2];
															
 
																-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
															
 
																+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, 0, memnode, options);
															
 
																 }
															
 
																 static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
															
@@ -2458,7 +2468,7 @@ static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_o
 
																 	unsigned memnode = ev->param[0];
															
 
																 	unsigned long handle = ev->param[2];
															
 
																-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
															
 
																+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, 0, memnode, options);
															
 
																 }
															
 
																 static void handle_push_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
															
@@ -3702,13 +3712,12 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
																 				if (options->memory_states)
															
 
																 					handle_data_state(&ev, options, "SS");
															
 
																 				break;
															
 
																-                       case _STARPU_FUT_DATA_REQUEST_CREATED:
															
 
																-                               if (!options->no_bus && options->memory_states)
															
 
																-                               {
															
 
																-                                       handle_memnode_event_start_4(&ev, options, "rc");
															
 
																-                               }
															
 
																-                               break;
															
 
																-
															
 
																+			case _STARPU_FUT_DATA_REQUEST_CREATED:
															
 
																+				if (!options->no_bus && options->memory_states)
															
 
																+				{
															
 
																+					handle_data_request(&ev, options, "rc");
															
 
																+				}
															
 
																+				break;
															
 
																 		  case _STARPU_FUT_PAPI_TASK_EVENT_VALUE:
															
 
																 				handle_papi_event(&ev, options);
															
 
																 				break;
															
@@ -4207,18 +4216,6 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
																 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
															
 
																 		free(options->worker_archtypes[i].devices);
															
 
																-	struct _starpu_symbol_name *itor, *next;
															
 
																-	for (itor = _starpu_symbol_name_list_begin(&symbol_list);
															
 
																-		itor != _starpu_symbol_name_list_end(&symbol_list);
															
 
																-		itor = next)
															
 
																-	{
															
 
																-		next = _starpu_symbol_name_list_next(itor);
															
 
																-
															
 
																-		_starpu_symbol_name_list_erase(&symbol_list, itor);
															
 
																-		free(itor->name);
															
 
																-		_starpu_symbol_name_delete(itor);
															
 
																-	}
															
 
																-
															
 
																 	_starpu_fxt_component_deinit();
															
 
																 	free_worker_ids();
															
@@ -4608,6 +4605,17 @@ void _starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
 
																 static
															
 
																 void _starpu_fxt_paje_file_close(void)
															
 
																 {
															
 
																+	struct _starpu_symbol_name *itor, *next;
															
 
																+	for (itor = _starpu_symbol_name_list_begin(&symbol_list);
															
 
																+		itor != _starpu_symbol_name_list_end(&symbol_list);
															
 
																+		itor = next)
															
 
																+	{
															
 
																+		next = _starpu_symbol_name_list_next(itor);
															
 
																+
															
 
																+		_starpu_symbol_name_list_erase(&symbol_list, itor);
															
 
																+		free(itor->name);
															
 
																+		_starpu_symbol_name_delete(itor);
															
 
																+	}
															
 
																 	if (out_paje_file)
															
 
																 		fclose(out_paje_file);
															
 
																 }
															
@@ -4658,6 +4666,7 @@ uint64_t _starpu_fxt_find_start_time(char *filename_in)
 
																 void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
															
 
																 {
															
 
																+	starpu_drivers_preinit();
															
 
																 	_starpu_fxt_options_set_dir(options);
															
 
																 	_starpu_fxt_dag_init(options->dag_path);
															
 
																 	_starpu_fxt_distrib_file_init(options);
															
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -40,6 +40,7 @@
 
																 #include <datawizard/memory_manager.h>
															
 
																 #include <datawizard/memory_nodes.h>
															
 
																 #include <datawizard/malloc.h>
															
 
																+#include <datawizard/datawizard.h>
															
 
																 #include <core/simgrid.h>
															
 
																 #include <core/task.h>
															
 
																 #include <core/disk.h>
															
@@ -341,7 +342,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 
																 		return ret;
															
 
																 	}
															
 
																-	res = __starpu_datawizard_progress(1, 1);
															
 
																+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
															
 
																 	if (!pending_task)
															
 
																 		task = _starpu_get_worker_task(cpu_worker, workerid, memnode);
															
@@ -429,7 +430,7 @@ int _starpu_cpu_driver_deinit(struct _starpu_worker *cpu_worker)
 
																 	_STARPU_TRACE_WORKER_DEINIT_START;
															
 
																 	unsigned memnode = cpu_worker->memory_node;
															
 
																-	_starpu_handle_all_pending_node_data_requests(memnode);
															
 
																+	_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
															
 
																 	/* In case there remains some memory that was automatically
															
 
																 	 * allocated by StarPU, we release it now. Note that data
															
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -37,6 +37,7 @@
 
																 #include <datawizard/memory_manager.h>
															
 
																 #include <datawizard/memory_nodes.h>
															
 
																 #include <datawizard/malloc.h>
															
 
																+#include <datawizard/datawizard.h>
															
 
																 #include <core/task.h>
															
 
																 #include <common/knobs.h>
															
@@ -935,14 +936,13 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
																 	if (!idle_tasks)
															
 
																 	{
															
 
																 		/* No task ready yet, no better thing to do than waiting */
															
 
																-		__starpu_datawizard_progress(1, !idle_transfers);
															
 
																+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
															
 
																 		return 0;
															
 
																 	}
															
 
																 #endif
															
 
																 	/* Something done, make some progress */
															
 
																-	res = !idle_tasks || !idle_transfers;
															
 
																-	res |= __starpu_datawizard_progress(1, 1);
															
 
																+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
															
 
																 	/* And pull tasks */
															
 
																 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, worker0->memory_node);
															
@@ -950,9 +950,6 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
																 #ifdef STARPU_SIMGRID
															
 
																 	if (!res)
															
 
																 		starpu_pthread_wait_wait(&worker0->wait);
															
 
																-#else
															
 
																-	if (!res)
															
 
																-		return 0;
															
 
																 #endif
															
 
																 	for (i = 0; i < (int) worker_set->nworkers; i++)
															
@@ -972,35 +969,6 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
																 		{
															
 
																 			/* this is neither a cuda or a cublas task */
															
 
																 			_starpu_worker_refuse_task(worker, task);
															
 
																-#if 0
															
 
																-			if (worker->pipeline_length)
															
 
																-			{
															
 
																-				int j;
															
 
																-				for (j = 0; j < worker->ntasks; j++)
															
 
																-				{
															
 
																-					const int j_mod = (j+worker->first_task)%STARPU_MAX_PIPELINE;
															
 
																-					if (task == worker->current_tasks[j_mod])
															
 
																-					{
															
 
																-						worker->current_tasks[j_mod] = NULL;
															
 
																-						if (j == 0)
															
 
																-						{
															
 
																-							worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
															
 
																-							_starpu_set_current_task(NULL);
															
 
																-						}
															
 
																-						break;
															
 
																-					}
															
 
																-				}
															
 
																-				STARPU_ASSERT(j<worker->ntasks);
															
 
																-			}
															
 
																-			else
															
 
																-			{
															
 
																-				worker->current_task = NULL;
															
 
																-				_starpu_set_current_task(NULL);
															
 
																-			}
															
 
																-			worker->ntasks--;
															
 
																-			int res = _starpu_push_task_to_workers(task);
															
 
																-			STARPU_ASSERT_MSG(res == 0, "_starpu_push_task_to_workers() unexpectedly returned = %d\n", res);
															
 
																-#endif
															
 
																 			continue;
															
 
																 		}
															
@@ -1039,7 +1007,7 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
 
																 		if (!usersleft)
															
 
																                 {
															
 
																 			/* I'm last, deinitialize device */
															
 
																-			_starpu_handle_all_pending_node_data_requests(memnode);
															
 
																+			_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
															
 
																 			/* In case there remains some memory that was automatically
															
 
																 			 * allocated by StarPU, we release it now. Note that data
															
--- a/src/drivers/mp_common/source_common.c
+++ b/src/drivers/mp_common/source_common.c
@@ -978,7 +978,7 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
 
																 		}
															
 
																 	}
															
 
																-        res |= __starpu_datawizard_progress(1, 1);
															
 
																+        res |= __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
															
 
																         /* Handle message which have been store */
															
 
																         _starpu_src_common_handle_stored_async(mp_node);
															
@@ -1075,7 +1075,7 @@ void _starpu_src_common_workers_set(struct _starpu_worker_set * worker_set, int
 
																         for (device = 0; device < ndevices; device++)
															
 
																 	{
															
 
																         	_STARPU_TRACE_END_PROGRESS(memnode[device]);
															
 
																-                _starpu_handle_all_pending_node_data_requests(memnode[device]);
															
 
																+                _starpu_datawizard_handle_all_pending_node_data_requests(memnode[device]);
															
 
																 	}
															
 
																         /* In case there remains some memory that was automatically
															
@@ -1107,7 +1107,7 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set, unsigned
 
																         _STARPU_TRACE_END_PROGRESS(memnode);
															
 
																-        _starpu_handle_all_pending_node_data_requests(memnode);
															
 
																+        _starpu_datawizard_handle_all_pending_node_data_requests(memnode);
															
 
																         /* In case there remains some memory that was automatically
															
 
																          * allocated by StarPU, we release it now. Note that data
															
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -31,6 +31,7 @@
 
																 #include <datawizard/memory_manager.h>
															
 
																 #include <datawizard/memory_nodes.h>
															
 
																 #include <datawizard/malloc.h>
															
 
																+#include <datawizard/datawizard.h>
															
 
																 #include <core/task.h>
															
 
																 #include <common/knobs.h>
															
@@ -787,13 +788,12 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 
																 	if (!idle_tasks)
															
 
																 	{
															
 
																 		/* No task ready yet, no better thing to do than waiting */
															
 
																-		__starpu_datawizard_progress(1, !idle_transfers);
															
 
																+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
															
 
																 		return 0;
															
 
																 	}
															
 
																 #endif
															
 
																-	res = !idle_tasks || !idle_transfers;
															
 
																-	res |= __starpu_datawizard_progress(1, 1);
															
 
																+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
															
 
																 	task = _starpu_get_worker_task(worker, workerid, memnode);
															
@@ -840,7 +840,7 @@ int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
 
																 	unsigned memnode = worker->memory_node;
															
 
																-	_starpu_handle_all_pending_node_data_requests(memnode);
															
 
																+	_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
															
 
																 	/* In case there remains some memory that was automatically
															
 
																 	 * allocated by StarPU, we release it now. Note that data
															
--- a/src/profiling/profiling.c
+++ b/src/profiling/profiling.c
@@ -114,6 +114,9 @@ int starpu_profiling_status_set(int status)
 
																 	{
															
 
																 		struct _starpu_worker *worker_struct = _starpu_get_worker_struct(worker);
															
 
																 		STARPU_PTHREAD_MUTEX_LOCK(&worker_struct->sched_mutex);
															
 
																+	}
															
 
																+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
															
 
																+	{
															
 
																 		STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[worker]);
															
 
																 	}
															
--- a/src/sched_policies/component_best_implementation.c
+++ b/src/sched_policies/component_best_implementation.c
@@ -112,7 +112,7 @@ static struct starpu_task * best_implementation_pull_task(struct starpu_sched_co
 
																 	}
															
 
																 	if(task)
															
 
																 		/* this worker can execute this task as it was returned by a pop*/
															
 
																-		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_worker_get_id_check());
															
 
																+		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_bitmap_first(&component->workers_in_ctx));
															
 
																 	return task;
															
 
																 }
															
--- a/src/sched_policies/component_fifo.c
+++ b/src/sched_policies/component_fifo.c
@@ -180,8 +180,10 @@ static struct starpu_task * fifo_pull_task(struct starpu_sched_component * compo
 
																 	struct starpu_task * task;
															
 
																 	if (data->ready && to->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE)
															
 
																 		task = _starpu_fifo_pop_first_ready_task(queue, starpu_bitmap_first(&to->workers_in_ctx), -1);
															
 
																+	else if (to->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS)
															
 
																+		task = _starpu_fifo_pop_task(queue, starpu_bitmap_first(&to->workers_in_ctx));
															
 
																 	else
															
 
																-		task = _starpu_fifo_pop_task(queue, starpu_worker_get_id_check());
															
 
																+		task = _starpu_fifo_pop_task(queue, -1);
															
 
																 	if(task && data->exp)
															
 
																 	{
															
 
																 		if(!isnan(task->predicted))
															
--- a/src/sched_policies/component_worker.c
+++ b/src/sched_policies/component_worker.c
@@ -443,8 +443,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 
																 		if(task)
															
 
																 		{
															
 
																 			_starpu_worker_task_list_transfer_started(list, task);
															
 
																-			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
															
 
																 			starpu_push_task_end(task);
															
 
																+			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
															
 
																 			goto ret;
															
 
																 		}
															
 
																 		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
															
@@ -470,8 +470,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 
																 			STARPU_COMPONENT_MUTEX_LOCK(&list->mutex);
															
 
																 			_starpu_worker_task_list_add(list, task);
															
 
																 			_starpu_worker_task_list_transfer_started(list, task);
															
 
																-			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
															
 
																 			starpu_push_task_end(task);
															
 
																+			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
															
 
																 			goto ret;
															
 
																 		}
															
 
																 		struct starpu_sched_component * combined_worker_component = starpu_sched_component_worker_get(component->tree->sched_ctx_id, workerid);
															
@@ -486,8 +486,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 
																 		STARPU_COMPONENT_MUTEX_LOCK(&list->mutex);
															
 
																 		_starpu_worker_task_list_add(list, task);
															
 
																 		_starpu_worker_task_list_transfer_started(list, task);
															
 
																-		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
															
 
																 		starpu_push_task_end(task);
															
 
																+		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
															
 
																 	}
															
 
																 ret:
															
 
																 	return task;
															
--- a/src/sched_policies/fifo_queues.c
+++ b/src/sched_policies/fifo_queues.c
@@ -352,6 +352,29 @@ int _starpu_normalize_prio(int priority, int num_priorities, unsigned sched_ctx_
 
																 	return ((num_priorities-1)/(max-min)) * (priority - min);
															
 
																 }
															
 
																+size_t _starpu_size_non_ready_buffers(struct starpu_task *task, unsigned worker)
															
 
																+{
															
 
																+	size_t cnt = 0;
															
 
																+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
															
 
																+	unsigned index;
															
 
																+
															
 
																+	for (index = 0; index < nbuffers; index++)
															
 
																+	{
															
 
																+		starpu_data_handle_t handle;
															
 
																+		unsigned buffer_node = _starpu_task_data_get_node_on_worker(task, index, worker);
															
 
																+
															
 
																+		handle = STARPU_TASK_GET_HANDLE(task, index);
															
 
																+
															
 
																+		int is_valid;
															
 
																+		starpu_data_query_status(handle, buffer_node, NULL, &is_valid, NULL);
															
 
																+
															
 
																+		if (!is_valid)
															
 
																+			cnt+=starpu_data_get_size(handle);
															
 
																+	}
															
 
																+
															
 
																+	return cnt;
															
 
																+}
															
 
																+
															
 
																 int _starpu_count_non_ready_buffers(struct starpu_task *task, unsigned worker)
															
 
																 {
															
 
																 	int cnt = 0;
															
@@ -392,7 +415,7 @@ struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq
 
																 		int first_task_priority = task->priority;
															
 
																-		int non_ready_best = INT_MAX;
															
 
																+		size_t non_ready_best = SIZE_MAX;
															
 
																 		for (current = task; current; current = current->next)
															
 
																 		{
															
@@ -400,7 +423,7 @@ struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq
 
																 			if (priority >= first_task_priority)
															
 
																 			{
															
 
																-				int non_ready = _starpu_count_non_ready_buffers(current, workerid);
															
 
																+				size_t non_ready = _starpu_size_non_ready_buffers(current, workerid);
															
 
																 				if (non_ready < non_ready_best)
															
 
																 				{
															
 
																 					non_ready_best = non_ready;
															
--- a/src/sched_policies/fifo_queues.h
+++ b/src/sched_policies/fifo_queues.h
@@ -69,6 +69,7 @@ struct starpu_task *_starpu_fifo_pop_local_task(struct _starpu_fifo_taskq *fifo)
 
																 struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo, int workerid);
															
 
																 int _starpu_normalize_prio(int priority, int num_priorities, unsigned sched_ctx_id);
															
 
																 int _starpu_count_non_ready_buffers(struct starpu_task *task, unsigned worker);
															
 
																+size_t _starpu_size_non_ready_buffers(struct starpu_task *task, unsigned worker);
															
 
																 struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq *fifo_queue, unsigned workerid, int num_priorities);
															
 
																 #endif // __FIFO_QUEUES_H__
															
--- a/src/sched_policies/prio_deque.c
+++ b/src/sched_policies/prio_deque.c
@@ -94,7 +94,7 @@ struct starpu_task *_starpu_prio_deque_deque_first_ready_task(struct _starpu_pri
 
																 			return NULL;
															
 
																 		int first_task_priority = task->priority;
															
 
																-		int non_ready_best = INT_MAX;
															
 
																+		size_t non_ready_best = SIZE_MAX;
															
 
																 		for (current = starpu_task_prio_list_begin(&pdeque->list);
															
 
																 		     current != starpu_task_prio_list_end(&pdeque->list);
															
@@ -104,7 +104,7 @@ struct starpu_task *_starpu_prio_deque_deque_first_ready_task(struct _starpu_pri
 
																 			if (priority >= first_task_priority)
															
 
																 			{
															
 
																-				int non_ready = _starpu_count_non_ready_buffers(current, workerid);
															
 
																+				size_t non_ready = _starpu_size_non_ready_buffers(current, workerid);
															
 
																 				if (non_ready < non_ready_best)
															
 
																 				{
															
 
																 					non_ready_best = non_ready;
															
--- a/src/sched_policies/work_stealing_policy.c
+++ b/src/sched_policies/work_stealing_policy.c
@@ -610,6 +610,11 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 
																 	if (_starpu_worker_trylock(victim))
															
 
																 	{
															
 
																 		/* victim is busy, don't bother it, come back later */
															
 
																+#ifdef STARPU_SIMGRID
															
 
																+		starpu_sleep(0.000001);
															
 
																+		/* Make sure we come back and not block */
															
 
																+		starpu_wake_worker_no_relax(workerid);
															
 
																+#endif
															
 
																 		return NULL;
															
 
																 	}
															
 
																 	if (ws->per_worker[victim].running && ws->per_worker[victim].queue.ntasks > 0)
															
--- a/src/util/execute_on_all.c
+++ b/src/util/execute_on_all.c