瀏覽代碼

Merge branch 'master' of gitlab.inria.fr:starpu/starpu

HE Kun 4 年之前
父節點
當前提交
83bc792574
共有 100 個文件被更改,包括 2708 次插入883 次删除
  1. 10 1
      .gitlab-ci.yml
  2. 1 0
      AUTHORS
  3. 3 0
      ChangeLog
  4. 2 0
      Makefile.am
  5. 21 16
      configure.ac
  6. 18 3
      contrib/ci.inria.fr/job-1-check.sh
  7. 22 0
      contrib/gitlab/simgrid.sh
  8. 1 1
      doc/doxygen/chapters/101_building.doxy
  9. 2 1
      doc/doxygen/chapters/310_data_management.doxy
  10. 22 34
      doc/doxygen/chapters/320_scheduling.doxy
  11. 4 4
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  12. 31 0
      doc/doxygen/chapters/410_mpi_support.doxy
  13. 31 0
      doc/doxygen/chapters/501_environment_variables.doxy
  14. 1 1
      doc/doxygen/chapters/code/disk_copy.c
  15. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.eps
  16. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.pdf
  17. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.png
  18. 1 1
      examples/Makefile.am
  19. 6 2
      examples/basic_examples/multiformat_conversion_codelets.c
  20. 54 158
      examples/cg/cg.c
  21. 0 25
      examples/cg/cg.h
  22. 216 37
      examples/cg/cg_kernels.c
  23. 1 1
      examples/pi/pi_redux.c
  24. 1 1
      examples/reductions/dot_product.c
  25. 1 1
      examples/reductions/minmax_reduction.c
  26. 13 0
      include/fstarpu_mod.f90
  27. 8 0
      include/starpu.h
  28. 14 4
      include/starpu_data.h
  29. 8 0
      include/starpu_hash.h
  30. 3 5
      include/starpu_perfmodel.h
  31. 28 2
      include/starpu_task.h
  32. 4 0
      include/starpu_util.h
  33. 2 2
      julia/README
  34. 2 2
      julia/examples/execute.sh.in
  35. 2 2
      julia/setenv.sh
  36. 1 1
      julia/src/StarPU.jl
  37. 89 2
      mpi/examples/Makefile.am
  38. 422 0
      mpi/examples/cg/cg.c
  39. 201 0
      mpi/examples/mpi_redux/mpi_redux.c
  40. 253 0
      mpi/examples/native_fortran/nf_mpi_redux.f90
  41. 238 0
      mpi/examples/native_fortran/nf_redux_test.f90
  42. 9 0
      mpi/include/starpu_mpi.h
  43. 0 1
      mpi/src/mpi/starpu_mpi_early_data.h
  44. 40 34
      mpi/src/mpi/starpu_mpi_mpi.c
  45. 0 2
      mpi/src/mpi/starpu_mpi_mpi_backend.c
  46. 0 1
      mpi/src/mpi/starpu_mpi_mpi_backend.h
  47. 28 9
      mpi/src/starpu_mpi.c
  48. 1 2
      mpi/src/starpu_mpi_coop_sends.c
  49. 16 6
      mpi/src/starpu_mpi_private.h
  50. 55 20
      mpi/src/starpu_mpi_task_insert.c
  51. 1 1
      mpi/src/starpu_mpi_task_insert_fortran.c
  52. 1 1
      mpi/tests/mpi_reduction.c
  53. 3 0
      mpi/tests/mpi_redux.c
  54. 20 20
      src/common/fxt.h
  55. 5 0
      src/common/hash.c
  56. 4 4
      src/common/uthash.h
  57. 1 1
      src/core/dependencies/data_arbiter_concurrency.c
  58. 2 2
      src/core/dependencies/data_concurrency.c
  59. 6 2
      src/core/dependencies/implicit_data_deps.c
  60. 1 2
      src/core/jobs.c
  61. 18 11
      src/core/perfmodel/energy_model.c
  62. 8 4
      src/core/perfmodel/perfmodel_bus.c
  63. 4 3
      src/core/perfmodel/perfmodel_history.c
  64. 1 20
      src/core/sched_policy.c
  65. 0 2
      src/core/sched_policy.h
  66. 10 0
      src/core/workers.c
  67. 78 61
      src/datawizard/coherency.c
  68. 10 10
      src/datawizard/coherency.h
  69. 3 3
      src/datawizard/copy_driver.c
  70. 8 1
      src/datawizard/copy_driver.h
  71. 306 179
      src/datawizard/data_request.c
  72. 34 13
      src/datawizard/data_request.h
  73. 87 25
      src/datawizard/datawizard.c
  74. 8 7
      src/datawizard/datawizard.h
  75. 1 1
      src/datawizard/filters.c
  76. 5 2
      src/datawizard/interfaces/data_interface.c
  77. 10 0
      src/datawizard/malloc.c
  78. 7 0
      src/datawizard/malloc.h
  79. 21 11
      src/datawizard/memalloc.c
  80. 1 1
      src/datawizard/memalloc.h
  81. 3 4
      src/datawizard/memory_nodes.c
  82. 21 0
      src/datawizard/memory_nodes.h
  83. 11 2
      src/datawizard/reduction.c
  84. 11 8
      src/datawizard/user_interactions.c
  85. 2 2
      src/datawizard/write_back.c
  86. 2 2
      src/debug/latency.c
  87. 51 42
      src/debug/traces/starpu_fxt.c
  88. 3 2
      src/drivers/cpu/driver_cpu.c
  89. 4 36
      src/drivers/cuda/driver_cuda.c
  90. 3 3
      src/drivers/mp_common/source_common.c
  91. 4 4
      src/drivers/opencl/driver_opencl.c
  92. 3 0
      src/profiling/profiling.c
  93. 1 1
      src/sched_policies/component_best_implementation.c
  94. 3 1
      src/sched_policies/component_fifo.c
  95. 3 3
      src/sched_policies/component_worker.c
  96. 25 2
      src/sched_policies/fifo_queues.c
  97. 1 0
      src/sched_policies/fifo_queues.h
  98. 2 2
      src/sched_policies/prio_deque.c
  99. 5 0
      src/sched_policies/work_stealing_policy.c
  100. 0 0
      src/util/execute_on_all.c

+ 10 - 1
.gitlab-ci.yml

@@ -30,7 +30,7 @@ build:
       when: never  # Prevent pipeline run for push event
     - when: always # Run pipeline for all other cases
 
-deploy:
+check:
   stage: deploy
   script:
     - ./contrib/gitlab/deploy.sh
@@ -38,3 +38,12 @@ deploy:
     - if: '$CI_PIPELINE_SOURCE == "push"'
       when: never  # Prevent pipeline run for push event
     - when: always # Run pipeline for all other cases
+
+simgrid:
+  stage: deploy
+  script:
+    - ./contrib/gitlab/simgrid.sh
+  rules:
+    - if: '$CI_PIPELINE_SOURCE == "push"'
+      when: never  # Prevent pipeline run for push event
+    - when: always # Run pipeline for all other cases

+ 1 - 0
AUTHORS

@@ -17,6 +17,7 @@ Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
 He Kun, Inria, <kun.he@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>
+Jego Antoine, Enseeiht, <antoine.jego@etu.enseeiht.fr>
 Juhoor Mehdi, Université de Bordeaux, <mjuhoor@gmail.com>
 Juven Alexis, Inria, <alexis.juven@inria.fr>
 Keryell-Even Maël, Inria, <mael.keryell@inria.fr>

+ 3 - 0
ChangeLog

@@ -51,9 +51,11 @@ New features:
     starpu_mpi_interface_datatype_node_register which will be needed for
     MPI/NUMA/GPUDirect.
   * Add peek_data interface method.
+  * Add STARPU_MPI_REDUX
 
 Small changes:
   * Add a synthetic energy efficiency testcase.
+  * Make reduction methods want the commute flag.
 
 StarPU 1.3.8
 ====================================================================
@@ -67,6 +69,7 @@ Small features:
     STARPU_MPI_THREAD_COREID environment variables to bind threads to cores
     instead of hyperthreads.
   * New STARPU_TASK_PROGRESS environment variable to show task progression.
+  * Add STARPU_SIMGRID environment variable guard against native builds.
 
 StarPU 1.3.7
 ====================================================================

+ 2 - 0
Makefile.am

@@ -53,9 +53,11 @@ if STARPU_BUILD_STARPURM
 SUBDIRS += starpurm
 endif
 
+if STARPU_USE_CPU
 if STARPU_BUILD_STARPUPY
 SUBDIRS += starpupy
 endif
+endif
 
 if STARPU_BUILD_SC_HYPERVISOR
 SUBDIRS += sc_hypervisor

+ 21 - 16
configure.ac

@@ -167,9 +167,8 @@ if test x$enable_simgrid = xyes ; then
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
 	fi
-	if test -n "$SIMGRID_LIBS" ; then
-		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
-	fi
+	SAVED_LIBS="${LIBS}"
+	LIBS="$SIMGRID_LIBS $LIBS"
 	AC_HAVE_LIBRARY([simgrid], [],
 		[
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
@@ -207,6 +206,7 @@ if test x$enable_simgrid = xyes ; then
 
 	# Oldies for compatibility with older simgrid
 	AC_CHECK_FUNCS([MSG_get_as_by_name MSG_zone_get_by_name MSG_environment_get_routing_root MSG_host_get_speed])
+	LIBS="${SAVED_LIBS}"
 
 	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
 	# We won't bind or detect anything
@@ -225,6 +225,7 @@ if test x$enable_simgrid = xyes ; then
 		SIMGRID_LIBS="$SIMGRID_LIBS -lstdc++"
 		LIBS="$LIBS -lstdc++"
 	fi
+	SIMGRID_LDFLAGS="$SIMGRID_LIBS -lsimgrid"
 
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	case \ $CXXFLAGS\  in
@@ -267,13 +268,13 @@ if test x$enable_simgrid = xyes ; then
 		AC_PATH_PROG([SIMGRID_MC], [simgrid-mc], [no], [$simgrid_dir/bin:$PATH])
 		LDFLAGS="$LDFLAGS -Wl,-znorelro -Wl,-znoseparate-code"
 		# libsimgrid needs to be linked from binaries themselves for MC to work
-		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lsimgrid"
+		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $SIMGRID_LDFLAGS"
 	fi
 fi
 AM_CONDITIONAL(STARPU_SIMGRID_MC, test x$enable_simgrid_mc = xyes)
 AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
 AC_SUBST(SIMGRID_CFLAGS)
-AC_SUBST(SIMGRID_LIBS)
+AC_SUBST(SIMGRID_LDFLAGS)
 AC_MSG_CHECKING(whether SimGrid is enabled)
 AC_MSG_RESULT($enable_simgrid)
 
@@ -2304,9 +2305,6 @@ if test x$maxnodes = x0 ; then
 	if test x$enable_simgrid = xyes ; then
 		# We need the room for the virtual CUDA/OpenCL devices
 		nodes=`expr 4 + $nmaxcudadev + $nmaxopencldev + $nmaxmicdev + 1 + $nmaxmpidev`
-		if test $nodes -gt 32 ; then
-			nodes=32
-		fi
 	else
 		# We have one memory node shared by all CPU workers, one node per GPU
 		# and per MIC device
@@ -2342,8 +2340,7 @@ if test x$maxnodes = x0 ; then
 	done
 fi
 if test $maxnodes -gt 32 ; then
-	# FIXME: at least use uint64 so we can have 64 memory nodes
-	AC_MSG_ERROR([selected number of nodes ($maxnodes) can not be greater than 32])
+	AC_MSG_WARN([Note: the wt_mask feature only supports 32 memory nodes])
 fi
 
 AC_MSG_CHECKING(maximum number of memory nodes)
@@ -3448,6 +3445,14 @@ then
 		AC_MSG_ERROR([python3 missing, cannot build StarPU python interface])
 	fi
 	AC_SUBST(PYTHON)
+	PYTHON_INCLUDE_DIRS="`$PYTHON -c "from sysconfig import get_paths as gp; print(gp()@<:@'include'@:>@)"`"
+	SAVED_CPPFLAGS="${CPPFLAGS}"
+	CPPFLAGS="$CPPFLAGS -I$PYTHON_INCLUDE_DIRS"
+	AC_CHECK_HEADERS([Python.h],[have_python_h=yes],[have_python_h=no])
+	if test "$have_python_h" = "no" ; then
+		AC_MSG_ERROR([Python.h missing, cannot build StarPU python interface (consider installing python-dev)])
+	fi
+	CPPFLAGS=${SAVED_CPPFLAGS}
 	AC_MSG_CHECKING(for python3 module joblib)
 	AC_PYTHON_MODULE(joblib,[joblib_avail=yes],[joblib_avail=no])
 	AC_MSG_RESULT($joblib_avail)
@@ -3565,7 +3570,7 @@ STARPU_H_CPPFLAGS="$HWLOC_CFLAGS $STARPU_CUDA_CPPFLAGS $STARPU_OPENCL_CPPFLAGS $
 AC_SUBST([STARPU_H_CPPFLAGS])
 
 # these are the flags needed for linking libstarpu (and thus also for static linking)
-LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LIBS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
+LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LDFLAGS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
 AC_SUBST([LIBSTARPU_LDFLAGS])
 
 # these are the flags needed for linking against libstarpu (because starpu.h makes its includer use pthread_*, simgrid, etc.)
@@ -3805,11 +3810,11 @@ AC_MSG_NOTICE([
 	       OpenMP runtime support enabled:                $enable_openmp
 	       Cluster support enabled:                       $enable_cluster
 	       SOCL enabled:                                  $build_socl
-               SOCL test suite:                               $run_socl_check
-               Scheduler Hypervisor:                          $build_sc_hypervisor
-               simgrid enabled:                               $enable_simgrid
-               ayudame enabled:                               $ayu_msg
-               HDF5 enabled:                                  $enable_hdf5
+	       SOCL test suite:                               $run_socl_check
+	       Scheduler Hypervisor:                          $build_sc_hypervisor
+	       simgrid enabled:                               $enable_simgrid
+	       ayudame enabled:                               $ayu_msg
+	       HDF5 enabled:                                  $enable_hdf5
 	       Native fortran support:                        $enable_build_fortran
 	       Native MPI fortran support:                    $use_mpi_fort
 	       Support for multiple linear regression models: $support_mlr

+ 18 - 3
contrib/ci.inria.fr/job-1-check.sh

@@ -37,7 +37,11 @@ basename=$(basename $tarball .tar.gz)
 export STARPU_HOME=$PWD/$basename/home
 mkdir -p $basename
 cd $basename
-env > $PWD/env
+(
+    echo "oldPWD=\${PWD}"
+    env|grep -v LS_COLORS | grep '^[A-Z]'|grep -v BASH_FUNC | grep '=' | sed 's/=/=\"/'| sed 's/$/\"/' | sed 's/^/export /'
+    echo "cd \$oldPWD"
+) > ${PWD}/env
 
 test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
 tar xfz ../$tarball
@@ -63,7 +67,17 @@ fi
 
 export CC=gcc
 
-CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc"
+set +e
+mpiexec -oversubscribe pwd 2>/dev/null
+ret=$?
+set -e
+ARGS=""
+if test "$ret" = "0"
+then
+    ARGS="--with-mpiexec-args=-oversubscribe"
+fi
+
+CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc $ARGS"
 CONFIGURE_CHECK=""
 day=$(date +%u)
 if test $day -le 5
@@ -72,10 +86,11 @@ then
 #else
     # we do a normal check, a long check takes too long on VM nodes
 fi
-../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS
+../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS $STARPU_USER_CONFIGURE_OPTIONS
 
 export STARPU_TIMEOUT_ENV=1800
 export MPIEXEC_TIMEOUT=1800
+
 make
 #make check
 (make -k check || true) 2>&1 | tee  ../check_$$

+ 22 - 0
contrib/gitlab/simgrid.sh

@@ -0,0 +1,22 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+STARPU_USER_CONFIGURE_OPTIONS="--enable-simgrid --disable-mpi --disable-mpi-check" ./contrib/ci.inria.fr/job-1-check.sh
+
+
+
+

+ 1 - 1
doc/doxygen/chapters/101_building.doxy

@@ -520,7 +520,7 @@ It can also be convenient to try simulated benchmarks, if you want to give a try
 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
 using the SimGrid version of StarPU: first install the SimGrid simulator from
 http://simgrid.gforge.inria.fr/ (we tested with SimGrid from 3.11 to 3.16, and
-3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with -Denable_msg=ON.
+3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with \c -Denable_msg=ON.
 Other versions may have compatibility issues, 3.17 notably does
 not build at all. MPI simulation does not work with version 3.22).
 Then configure StarPU with \ref enable-simgrid

+ 2 - 1
doc/doxygen/chapters/310_data_management.doxy

@@ -643,7 +643,8 @@ struct starpu_codelet accumulate_variable_cl =
         .cpu_funcs = { accumulate_variable_cpu },
         .cpu_funcs_name = { "accumulate_variable_cpu" },
         .cuda_funcs = { accumulate_variable_cuda },
-        .nbuffers = 1,
+        .nbuffers = 2,
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 }
 \endcode
 

文件差異過大導致無法顯示
+ 22 - 34
doc/doxygen/chapters/320_scheduling.doxy


+ 4 - 4
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -515,12 +515,12 @@ The <c>-f</c> option can also be used to display the performance in terms of GFl
 
 \verbatim
 $ tools/starpu_perfmodel_plot -f -e -s non_linear_memset_regression_based_energy
-$ gnuplot starpu_non_linear_memset_regression_based_energy.gp
-$ gv starpu_non_linear_memset_regression_based_energy.eps
+$ gnuplot starpu_gflops_non_linear_memset_regression_based_energy.gp
+$ gv starpu_gflops_non_linear_memset_regression_based_energy.eps
 \endverbatim
 
-\image html starpu_non_linear_memset_regression_based_energy_flops.png
-\image latex starpu_non_linear_memset_regression_based_energy_flops.eps "" width=\textwidth
+\image html starpu_gflops_non_linear_memset_regression_based_energy.png
+\image latex starpu_gflops_non_linear_memset_regression_based_energy.eps "" width=\textwidth
 
 We clearly see here that it is much more energy-efficient to stay in the L3 cache.
 

+ 31 - 0
doc/doxygen/chapters/410_mpi_support.doxy

@@ -744,6 +744,37 @@ starpu_mpi_data_set_rank(data, STARPU_MPI_PER_NODE);
 
 The data can then be used just like pernode above.
 
+\section MPIMpiRedux Inter-node reduction
+
+One might want to leverage a reduction pattern across several nodes.
+Using \c STARPU_REDUX, one can obtain reduction patterns across several nodes,
+however each core across the contributing nodes will spawn their own
+contribution to work with. In the case that these allocations or the
+required reductions are too expensive to execute for each contribution,
+the access mode \c STARPU_MPI_REDUX tells StarPU to spawn only one contribution
+per node executing tasks partaking in the reduction.
+
+Tasks producing a result in the inter-node reduction should be registered as
+accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode.
+
+\code{.c}
+static struct starpu_codelet contrib_cl =
+{
+	.cpu_funcs = {cpu_contrib}, /* cpu implementation(s) of the routine */
+	.nbuffers = 1, /* number of data handles referenced by this routine */
+	.modes = {STARPU_RW | STARPU_COMMUTE}, /* access modes for the contribution */
+	.name = "contribution"
+};
+\endcode
+
+When inserting these tasks, the access mode handed out to the StarPU-MPI layer
+should be \c STARPU_MPI_REDUX. Assuming \c data is owned by node 0 and we want node
+1 to compute the contribution, we could do the following.
+
+\code{.c}
+starpu_mpi_task_insert(MPI_COMM_WORLD, &contrib_cl, STARPU_MPI_REDUX, data, EXECUTE_ON_NODE, 1); /* Node 1 computes it */
+\endcode
+
 \section MPIPriorities Priorities
 
 All send functions have a <c>_prio</c> variant which takes an additional

+ 31 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -473,6 +473,16 @@ todo
 todo
 </dd>
 
+<dt>STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES</dt>
+<dd>
+\anchor STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
+\addindex __env__STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
+Specify if CUDA workers should do only fast allocations
+when running the datawizard progress of
+other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
+Default value is 0, allowing CUDA workers to do slow allocations.
+</dd>
+
 </dl>
 
 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
@@ -738,6 +748,27 @@ block when the memory allocation required for network reception overflows the
 available main memory (as typically set by \ref STARPU_LIMIT_CPU_MEM)
 </dd>
 
+<dt>STARPU_MPI_EARLYDATA_ALLOCATE</dt>
+<dd>
+\anchor STARPU_MPI_EARLYDATA_ALLOCATE
+\addindex __env__STARPU_MPI_EARLYDATA_ALLOCATE
+When set to 1, the MPI Driver will immediately allocate the data for early
+requests instead of issuing a data request and blocking. The default value is 0,
+issuing a data request. Because it is an early request and we do not know its
+real priority, the data request will assume \ref STARPU_DEFAULT_PRIO. In cases
+where there are many data requests with priorities greater than
+\ref STARPU_DEFAULT_PRIO, the MPI driver could be blocked for long periods.
+</dd>
+
+<dt>STARPU_SIMGRID</dt>
+<dd>
+\anchor STARPU_SIMGRID
+\addindex __env__STARPU_SIMGRID
+When set to 1 (the default is 0), this makes StarPU check that it was really
+built with simulation support. This is convenient in scripts to avoid using a
+native version, that would try to update performance models...
+</dd>
+
 <dt>STARPU_SIMGRID_TRANSFER_COST</dt>
 <dd>
 \anchor STARPU_SIMGRID_TRANSFER_COST

+ 1 - 1
doc/doxygen/chapters/code/disk_copy.c

@@ -33,7 +33,7 @@
 
 int main(int argc, char **argv)
 {
-	double * A,*B,*C,*D,*E,*F;
+	double *A, *F;
 
 	/* limit main ram to force to push in disk */
 	setenv("STARPU_LIMIT_CPU_MEM", "160", 1);

doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.eps → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.eps


doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.pdf → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.pdf


doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.png → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.png


+ 1 - 1
examples/Makefile.am

@@ -106,6 +106,7 @@ examplebin_PROGRAMS =
 noinst_HEADERS = 				\
 	axpy/axpy.h                             \
 	cg/cg.h					\
+	cg/cg_kernels.c				\
 	heat/lu_kernels_model.h			\
 	heat/dw_sparse_cg.h			\
 	heat/heat.h				\
@@ -869,7 +870,6 @@ if !STARPU_NO_BLAS_LIB
 
 cg_cg_SOURCES =					\
 	cg/cg.c					\
-	cg/cg_kernels.c				\
 	common/blas.c
 
 cg_cg_LDADD =					\

+ 6 - 2
examples/basic_examples/multiformat_conversion_codelets.c

@@ -41,6 +41,7 @@ struct starpu_codelet cpu_to_cuda_cl =
 	.cuda_funcs = {cpu_to_cuda_cuda_func},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.name = "codelet_cpu_to_cuda"
 };
 
@@ -48,6 +49,7 @@ struct starpu_codelet cuda_to_cpu_cl =
 {
 	.cpu_funcs = {cuda_to_cpu},
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.name = "codelet_cude_to_cpu"
 };
 #endif
@@ -73,12 +75,14 @@ struct starpu_codelet cpu_to_opencl_cl =
 {
 	.opencl_funcs = {cpu_to_opencl_opencl_func},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
 };
 
 struct starpu_codelet opencl_to_cpu_cl =
 {
 	.cpu_funcs = {opencl_to_cpu},
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
 };
 #endif

+ 54 - 158
examples/cg/cg.c

@@ -19,11 +19,6 @@
 #include <starpu.h>
 #include <common/blas.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cuda.h>
-#endif
-
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
 /*
  *	Conjugate Gradient
@@ -68,32 +63,34 @@
 
 #include "cg.h"
 
-static int long long n = 4096;
-static int nblocks = 8;
-static int use_reduction = 1;
+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks);
 
-static starpu_data_handle_t A_handle, b_handle, x_handle;
-static TYPE *A, *b, *x;
+#define HANDLE_TYPE_VECTOR starpu_data_handle_t
+#define HANDLE_TYPE_MATRIX starpu_data_handle_t
+#define TASK_INSERT(cl, ...) starpu_task_insert(cl, ##__VA_ARGS__)
+#define GET_VECTOR_BLOCK(v, i) starpu_data_get_sub_data(v, 1, i)
+#define GET_MATRIX_BLOCK(m, i, j) starpu_data_get_sub_data(m, 2, i, j)
+#define BARRIER()
+#define GET_DATA_HANDLE(handle)
+#define FPRINTF_SERVER FPRINTF
+
+#include "cg_kernels.c"
 
-#ifdef STARPU_QUICK_CHECK
-static int i_max = 5;
-#elif !defined(STARPU_LONG_CHECK)
-static int i_max = 100;
-#else
-static int i_max = 1000;
-#endif
-static double eps = (10e-14);
 
-static starpu_data_handle_t r_handle, d_handle, q_handle;
+
+static TYPE *A, *b, *x;
 static TYPE *r, *d, *q;
 
-static starpu_data_handle_t dtq_handle, rtr_handle;
-static TYPE dtq, rtr;
 
-extern struct starpu_codelet accumulate_variable_cl;
-extern struct starpu_codelet accumulate_vector_cl;
-extern struct starpu_codelet bzero_variable_cl;
-extern struct starpu_codelet bzero_vector_cl;
+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
+{
+	unsigned b;
+
+	for (b = 0; b < nblocks; b++)
+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	return 0;
+}
+
 
 /*
  *	Generate Input data
@@ -264,162 +261,48 @@ static void display_matrix(void)
 }
 #endif
 
-/*
- *	Main loop
- */
-
-static int cg(void)
+static void display_x_result(void)
 {
-	double delta_new, delta_0;
-
-	int i = 0;
-	int ret;
+	int j, i;
+	starpu_data_handle_t sub;
 
-	/* r <- b */
-	ret = copy_handle(r_handle, b_handle, nblocks);
-	if (ret == -ENODEV) return ret;
+	FPRINTF(stderr, "Computed X vector:\n");
 
-	/* r <- r - A x */
-	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
-	if (ret == -ENODEV) return ret;
+	int block_size = n / nblocks;
 
-	/* d <- r */
-	ret = copy_handle(d_handle, r_handle, nblocks);
-	if (ret == -ENODEV) return ret;
-
-	/* delta_new = dot(r,r) */
-	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
-	if (ret == -ENODEV) return ret;
-
-	starpu_data_acquire(rtr_handle, STARPU_R);
-	delta_new = rtr;
-	delta_0 = delta_new;
-	starpu_data_release(rtr_handle);
-
-	FPRINTF(stderr, "*************** INITIAL ************ \n");
-	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
-
-	double start;
-	double end;
-	start = starpu_timing_now();
-
-	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
+	for (j = 0; j < nblocks; j++)
 	{
-		double delta_old;
-		double alpha, beta;
-
-		starpu_iteration_push(i);
-
-		/* q <- A d */
-		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
-
-		/* dtq <- dot(d,q) */
-		dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
-
-		/* alpha = delta_new / dtq */
-		starpu_data_acquire(dtq_handle, STARPU_R);
-		alpha = delta_new/dtq;
-		starpu_data_release(dtq_handle);
-
-		/* x <- x + alpha d */
-		axpy_kernel(x_handle, d_handle, alpha, nblocks);
-
-		if ((i % 50) == 0)
-		{
-			/* r <- b */
-			copy_handle(r_handle, b_handle, nblocks);
-
-			/* r <- r - A x */
-			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
-		}
-		else
-		{
-			/* r <- r - alpha q */
-			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
-		}
-
-		/* delta_new = dot(r,r) */
-		dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
-
-		starpu_data_acquire(rtr_handle, STARPU_R);
-		delta_old = delta_new;
-		delta_new = rtr;
-		beta = delta_new / delta_old;
-		starpu_data_release(rtr_handle);
-
-		/* d <- beta d + r */
-		scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
-
-		if ((i % 10) == 0)
+		sub = starpu_data_get_sub_data(x_handle, 1, j);
+		starpu_data_acquire(sub, STARPU_R);
+		for (i = 0; i < block_size; i++)
 		{
-			/* We here take the error as ||r||_2 / (n||b||_2) */
-			double error = sqrt(delta_new/delta_0)/(1.0*n);
-			FPRINTF(stderr, "*****************************************\n");
-			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+			FPRINTF(stderr, "% 02.2e\n", x[j*block_size + i]);
 		}
-
-		starpu_iteration_pop();
-		i++;
+		starpu_data_release(sub);
 	}
-
-	end = starpu_timing_now();
-
-	double timing = end - start;
-	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
-	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
-	return 0;
 }
 
-static int check(void)
-{
-	return 0;
-}
 
 static void parse_args(int argc, char **argv)
 {
 	int i;
 	for (i = 1; i < argc; i++)
 	{
-	        if (strcmp(argv[i], "-n") == 0)
-		{
-			n = (int long long)atoi(argv[++i]);
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-maxiter") == 0)
-		{
-			i_max = atoi(argv[++i]);
-			if (i_max <= 0)
-			{
-				FPRINTF(stderr, "the number of iterations must be positive, not %d\n", i_max);
-				exit(EXIT_FAILURE);
-			}
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-nblocks") == 0)
-		{
-			nblocks = atoi(argv[++i]);
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-no-reduction") == 0)
-		{
-			use_reduction = 0;
-			continue;
-		}
-
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
 		{
-			FPRINTF(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
+			FPRINTF_SERVER(stderr, "usage: %s [-h] [-nblocks #blocks] [-display-result] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
 			exit(-1);
 		}
-        }
+	}
+
+	parse_common_args(argc, argv);
 }
 
+
 int main(int argc, char **argv)
 {
 	int ret;
+	double start, end;
 
 	/* Not supported yet */
 	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
@@ -434,9 +317,19 @@ int main(int argc, char **argv)
 
 	starpu_cublas_init();
 
+	FPRINTF(stderr, "************** PARAMETERS ***************\n");
+	FPRINTF(stderr, "Problem size (-n): %lld\n", n);
+	FPRINTF(stderr, "Maximum number of iterations (-maxiter): %d\n", i_max);
+	FPRINTF(stderr, "Number of blocks (-nblocks): %d\n", nblocks);
+	FPRINTF(stderr, "Reduction (-no-reduction): %s\n", use_reduction ? "enabled" : "disabled");
+
+	start = starpu_timing_now();
 	generate_random_problem();
 	register_data();
 	partition_data();
+	end = starpu_timing_now();
+
+	FPRINTF(stderr, "Problem intialization timing : %2.2f seconds\n", (end-start)/10e6);
 
 	ret = cg();
 	if (ret == -ENODEV)
@@ -445,10 +338,13 @@ int main(int argc, char **argv)
 		goto enodev;
 	}
 
-	ret = check();
-
 	starpu_task_wait_for_all();
 
+	if (display_result)
+	{
+		display_x_result();
+	}
+
 enodev:
 	unregister_data();
 	free_data();

+ 0 - 25
examples/cg/cg.h

@@ -54,29 +54,4 @@
 #define cublasscal	cublasSscal
 #endif
 
-int dot_kernel(starpu_data_handle_t v1,
-	       starpu_data_handle_t v2,
-	       starpu_data_handle_t s,
-	       unsigned nblocks,
-	       int use_reduction);
-
-int gemv_kernel(starpu_data_handle_t v1,
-                starpu_data_handle_t matrix, 
-                starpu_data_handle_t v2,
-                TYPE p1, TYPE p2,
-		unsigned nblocks,
-		int use_reduction);
-
-int axpy_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t v2, TYPE p1,
-		unsigned nblocks);
-
-int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
-		     starpu_data_handle_t v2, TYPE p2,
-		     unsigned nblocks);
-
-int copy_handle(starpu_data_handle_t dst,
-		starpu_data_handle_t src,
-		unsigned nblocks);
-
 #endif /* __STARPU_EXAMPLE_CG_H__ */

+ 216 - 37
examples/cg/cg_kernels.c

@@ -23,11 +23,43 @@
 #include <limits.h>
 
 #ifdef STARPU_USE_CUDA
+#include <cuda.h>
 #include <starpu_cublas_v2.h>
 static const TYPE gp1 = 1.0;
 static const TYPE gm1 = -1.0;
 #endif
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+static int nblocks = 8;
+
+#ifdef STARPU_QUICK_CHECK
+static int i_max = 5;
+static int long long n = 2048;
+#elif !defined(STARPU_LONG_CHECK)
+static int long long n = 4096;
+static int i_max = 100;
+#else
+static int long long n = 4096;
+static int i_max = 1000;
+#endif
+static double eps = (10e-14);
+
+int use_reduction = 1;
+int display_result = 0;
+
+HANDLE_TYPE_MATRIX A_handle;
+HANDLE_TYPE_VECTOR b_handle;
+HANDLE_TYPE_VECTOR x_handle;
+
+HANDLE_TYPE_VECTOR r_handle;
+HANDLE_TYPE_VECTOR d_handle;
+HANDLE_TYPE_VECTOR q_handle;
+
+starpu_data_handle_t dtq_handle;
+starpu_data_handle_t rtr_handle;
+TYPE dtq, rtr;
+
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 {
@@ -120,7 +152,7 @@ struct starpu_codelet accumulate_variable_cl =
 	.cuda_funcs = {accumulate_variable_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.model = &accumulate_variable_model
 };
@@ -164,7 +196,7 @@ struct starpu_codelet accumulate_vector_cl =
 	.cuda_funcs = {accumulate_vector_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.model = &accumulate_vector_model
 };
@@ -314,8 +346,8 @@ static struct starpu_codelet dot_kernel_cl =
 	.model = &dot_kernel_model
 };
 
-int dot_kernel(starpu_data_handle_t v1,
-	       starpu_data_handle_t v2,
+int dot_kernel(HANDLE_TYPE_VECTOR v1,
+	       HANDLE_TYPE_VECTOR v2,
 	       starpu_data_handle_t s,
 	       unsigned nblocks,
 	       int use_reduction)
@@ -327,21 +359,21 @@ int dot_kernel(starpu_data_handle_t v1,
 		starpu_data_invalidate_submit(s);
 	else
 	{
-		ret = starpu_task_insert(&bzero_variable_cl, STARPU_W, s, 0);
+		ret = TASK_INSERT(&bzero_variable_cl, STARPU_W, s, 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		ret = starpu_task_insert(&dot_kernel_cl,
+		ret = TASK_INSERT(&dot_kernel_cl,
 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
-					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_R, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R, GET_VECTOR_BLOCK(v2, b),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	return 0;
 }
@@ -477,9 +509,9 @@ static struct starpu_codelet gemv_kernel_cl =
 	.model = &gemv_kernel_model
 };
 
-int gemv_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t matrix,
-		starpu_data_handle_t v2,
+int gemv_kernel(HANDLE_TYPE_VECTOR v1,
+		HANDLE_TYPE_MATRIX matrix,
+		HANDLE_TYPE_VECTOR v2,
 		TYPE p1, TYPE p2,
 		unsigned nblocks,
 		int use_reduction)
@@ -489,13 +521,13 @@ int gemv_kernel(starpu_data_handle_t v1,
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	{
-		ret = starpu_task_insert(&scal_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
+		ret = TASK_INSERT(&scal_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b2),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b2,
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 
 	for (b2 = 0; b2 < nblocks; b2++)
@@ -503,15 +535,15 @@ int gemv_kernel(starpu_data_handle_t v1,
 		for (b1 = 0; b1 < nblocks; b1++)
 		{
 			TYPE one = 1.0;
-			ret = starpu_task_insert(&gemv_kernel_cl,
-						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
-						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
-						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
+			ret = TASK_INSERT(&gemv_kernel_cl,
+						 use_reduction?STARPU_REDUX:STARPU_RW,	GET_VECTOR_BLOCK(v1, b2),
+						 STARPU_R,	GET_MATRIX_BLOCK(matrix, b2, b1),
+						 STARPU_R,	GET_VECTOR_BLOCK(v2, b1),
 						 STARPU_VALUE,	&one,	sizeof(one),
 						 STARPU_VALUE,	&p2,	sizeof(p2),
 						 STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nblocks + b1,
 						 0);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+			STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 		}
 	}
 	return 0;
@@ -582,23 +614,23 @@ static struct starpu_codelet scal_axpy_kernel_cl =
 	.model = &scal_axpy_kernel_model
 };
 
-int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
-		     starpu_data_handle_t v2, TYPE p2,
+int scal_axpy_kernel(HANDLE_TYPE_VECTOR v1, TYPE p1,
+		     HANDLE_TYPE_VECTOR v2, TYPE p2,
 		     unsigned nblocks)
 {
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
 		int ret;
-		ret = starpu_task_insert(&scal_axpy_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+		ret = TASK_INSERT(&scal_axpy_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p2, sizeof(p2),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	return 0;
 }
@@ -661,30 +693,177 @@ static struct starpu_codelet axpy_kernel_cl =
 	.model = &axpy_kernel_model
 };
 
-int axpy_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t v2, TYPE p1,
+int axpy_kernel(HANDLE_TYPE_VECTOR v1,
+		HANDLE_TYPE_VECTOR v2, TYPE p1,
 		unsigned nblocks)
 {
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
 		int ret;
-		ret = starpu_task_insert(&axpy_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+		ret = TASK_INSERT(&axpy_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	return 0;
 }
 
-int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
+
+/*
+ *	Main loop
+ */
+int cg(void)
 {
-	unsigned b;
-	for (b = 0; b < nblocks; b++)
-		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	TYPE delta_new, delta_0, error, delta_old, alpha, beta;
+	double start, end, timing;
+	int i = 0, ret;
+
+	/* r <- b */
+	ret = copy_handle(r_handle, b_handle, nblocks);
+	if (ret == -ENODEV) return ret;
+
+	/* r <- r - A x */
+	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
+	if (ret == -ENODEV) return ret;
+
+	/* d <- r */
+	ret = copy_handle(d_handle, r_handle, nblocks);
+	if (ret == -ENODEV) return ret;
+
+	/* delta_new = dot(r,r) */
+	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
+	if (ret == -ENODEV) return ret;
+
+	GET_DATA_HANDLE(rtr_handle);
+	starpu_data_acquire(rtr_handle, STARPU_R);
+	delta_new = rtr;
+	delta_0 = delta_new;
+	starpu_data_release(rtr_handle);
+
+	FPRINTF_SERVER(stderr, "Delta limit: %e\n", (double) (eps*eps*delta_0));
+
+	FPRINTF_SERVER(stderr, "**************** INITIAL ****************\n");
+	FPRINTF_SERVER(stderr, "Delta 0: %e\n", delta_new);
+
+	BARRIER();
+	start = starpu_timing_now();
+
+	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
+	{
+		starpu_iteration_push(i);
+
+		/* q <- A d */
+		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
+
+		/* dtq <- dot(d,q) */
+		dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
+
+		/* alpha = delta_new / dtq */
+		GET_DATA_HANDLE(dtq_handle);
+		starpu_data_acquire(dtq_handle, STARPU_R);
+		alpha = delta_new / dtq;
+		starpu_data_release(dtq_handle);
+
+		/* x <- x + alpha d */
+		axpy_kernel(x_handle, d_handle, alpha, nblocks);
+
+		if ((i % 50) == 0)
+		{
+			/* r <- b */
+			copy_handle(r_handle, b_handle, nblocks);
+
+			/* r <- r - A x */
+			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
+		}
+		else
+		{
+			/* r <- r - alpha q */
+			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
+		}
+
+		/* delta_new = dot(r,r) */
+		dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
+
+		GET_DATA_HANDLE(rtr_handle);
+		starpu_data_acquire(rtr_handle, STARPU_R);
+		delta_old = delta_new;
+		delta_new = rtr;
+		beta = delta_new / delta_old;
+		starpu_data_release(rtr_handle);
+
+		/* d <- beta d + r */
+		scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
+
+		if ((i % 10) == 0)
+		{
+			/* We here take the error as ||r||_2 / (n||b||_2) */
+			error = sqrt(delta_new/delta_0)/(1.0*n);
+			FPRINTF_SERVER(stderr, "*****************************************\n");
+			FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+		}
+
+		starpu_iteration_pop();
+		i++;
+	}
+
+	BARRIER();
+	end = starpu_timing_now();
+	timing = end - start;
+
+	error = sqrt(delta_new/delta_0)/(1.0*n);
+	FPRINTF_SERVER(stderr, "*****************************************\n");
+	FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+	FPRINTF_SERVER(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
+	FPRINTF_SERVER(stderr, "Seconds per iteration : %2.2e seconds\n", timing/10e6/i);
+	FPRINTF_SERVER(stderr, "Number of iterations per second : %2.2e it/s\n", i/(timing/10e6));
+
 	return 0;
 }
+
+
+void parse_common_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-n") == 0)
+		{
+			n = (int long long)atoi(argv[++i]);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-display-result") == 0)
+		{
+			display_result = 1;
+			continue;
+		}
+
+		if (strcmp(argv[i], "-maxiter") == 0)
+		{
+			i_max = atoi(argv[++i]);
+			if (i_max <= 0)
+			{
+				FPRINTF_SERVER(stderr, "the number of iterations must be positive, not %d\n", i_max);
+				exit(EXIT_FAILURE);
+			}
+			continue;
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			nblocks = atoi(argv[++i]);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-no-reduction") == 0)
+		{
+			use_reduction = 0;
+			continue;
+		}
+	}
+}

+ 1 - 1
examples/pi/pi_redux.c

@@ -322,7 +322,7 @@ static struct starpu_codelet redux_codelet =
 	.cuda_funcs = {redux_cuda_func},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2
 };
 

+ 1 - 1
examples/reductions/dot_product.c

@@ -211,7 +211,7 @@ static struct starpu_codelet redux_codelet =
 	.opencl_funcs = {redux_opencl_func},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.name = "redux"
 };

+ 1 - 1
examples/reductions/minmax_reduction.c

@@ -95,7 +95,7 @@ static struct starpu_codelet minmax_redux_codelet =
 {
 	.cpu_funcs = {minmax_redux_cpu_func},
 	.cpu_funcs_name = {"minmax_redux_cpu_func"},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.name = "redux"
 };

+ 13 - 0
include/fstarpu_mod.f90

@@ -25,6 +25,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_RW
         type(c_ptr), bind(C) :: FSTARPU_SCRATCH
         type(c_ptr), bind(C) :: FSTARPU_REDUX
+        type(c_ptr), bind(C) :: FSTARPU_MPI_REDUX
         type(c_ptr), bind(C) :: FSTARPU_COMMUTE
         type(c_ptr), bind(C) :: FSTARPU_SSEND
         type(c_ptr), bind(C) :: FSTARPU_LOCALITY
@@ -36,11 +37,15 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_TASK_DEPS_ARRAY
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG
+        type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG
+        type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG
+        type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG
+        type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PRIORITY
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_NODE
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_DATA
@@ -2395,6 +2400,7 @@ module fstarpu_mod
                         FSTARPU_RW      = fstarpu_get_constant(C_CHAR_"FSTARPU_RW"//C_NULL_CHAR)
                         FSTARPU_SCRATCH = fstarpu_get_constant(C_CHAR_"FSTARPU_SCRATCH"//C_NULL_CHAR)
                         FSTARPU_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_REDUX"//C_NULL_CHAR)
+                        FSTARPU_MPI_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_MPI_REDUX"//C_NULL_CHAR)
                         FSTARPU_COMMUTE   = fstarpu_get_constant(C_CHAR_"FSTARPU_COMMUTE"//C_NULL_CHAR)
                         FSTARPU_SSEND   = fstarpu_get_constant(C_CHAR_"FSTARPU_SSEND"//C_NULL_CHAR)
                         FSTARPU_LOCALITY   = fstarpu_get_constant(C_CHAR_"FSTARPU_LOCALITY"//C_NULL_CHAR)
@@ -2406,12 +2412,19 @@ module fstarpu_mod
                         FSTARPU_TASK_DEPS_ARRAY = fstarpu_get_constant(C_CHAR_"FSTARPU_TASK_DEPS_ARRAY"//C_NULL_CHAR)
                         FSTARPU_CALLBACK        = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK"//C_NULL_CHAR)
                         FSTARPU_CALLBACK_WITH_ARG       = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG"//C_NULL_CHAR)
+                        FSTARPU_CALLBACK_WITH_ARG_NFREE       = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_CALLBACK_ARG    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG"//C_NULL_CHAR)
+                        FSTARPU_CALLBACK_ARG_NFREE    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK       = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_ARG   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG"//C_NULL_CHAR)
+                        FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE   = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_POP   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_POP_ARG       = &
                                 fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG"//C_NULL_CHAR)
+                        FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE       = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PRIORITY        = fstarpu_get_constant(C_CHAR_"FSTARPU_PRIORITY"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_NODE = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_NODE"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_DATA = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_DATA"//C_NULL_CHAR)

+ 8 - 0
include/starpu.h

@@ -471,6 +471,14 @@ struct starpu_conf
 	   Maximum spinning backoff of drivers. Default value: \c 32
 	 */
 	unsigned driver_spinning_backoff_max;
+
+	/**
+	   Specify if CUDA workers should do only fast allocations
+	   when running the datawizard progress of
+	   other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
+	   Default value is 0, allowing CUDA workers to do slow allocations.
+	 */
+	int cuda_only_fast_alloc_other_memnodes;
 };
 
 /**

+ 14 - 4
include/starpu_data.h

@@ -110,7 +110,15 @@ enum starpu_data_access_mode
 				   src/sched_policies/work_stealing_policy.c
 				   source code.
 				*/
-	STARPU_ACCESS_MODE_MAX=(1<<7) /**< todo */
+	STARPU_MPI_REDUX=(1<<7), /**< Inter-node reduction only. Codelets
+				    contributing to these reductions should
+				    be registered with STARPU_RW | STARPU_COMMUTE
+				    access modes.
+			            When inserting these tasks through the
+				    MPI layer however, the access mode needs
+				    to be STARPU_MPI_REDUX. */
+	STARPU_ACCESS_MODE_MAX=(1<<8) /**< The purpose of ACCESS_MODE_MAX is to
+					be the maximum of this enum. */
 };
 
 struct starpu_data_interface_ops;
@@ -305,7 +313,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_quick(starpu_data_hand
 
    This is a very internal interface, subject to changes, do not use this.
 */
-int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback_acquired)(void *arg, int *node, enum starpu_data_access_mode mode), void (*callback)(void *arg), void *arg, int sequential_consistency, int quick, long *pre_sync_jobid, long *post_sync_jobid);
+int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback_acquired)(void *arg, int *node, enum starpu_data_access_mode mode), void (*callback)(void *arg), void *arg, int sequential_consistency, int quick, long *pre_sync_jobid, long *post_sync_jobid, int prio);
 
 /**
    The application can call this function instead of starpu_data_acquire() so as to
@@ -560,8 +568,10 @@ struct starpu_codelet;
 /**
    Set the codelets to be used for \p handle when it is accessed in the
    mode ::STARPU_REDUX. Per-worker buffers will be initialized with
-   the codelet \p init_cl, and reduction between per-worker buffers will be
-   done with the codelet \p redux_cl.
+   the codelet \p init_cl (which has to take one handle with STARPU_W), and
+   reduction between per-worker buffers will be done with the codelet \p
+   redux_cl (which has to take a first accumulation handle with
+   STARPU_RW|STARPU_COMMUTE, and a second contribution handle with STARPU_R).
 */
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle, struct starpu_codelet *redux_cl, struct starpu_codelet *init_cl);
 

+ 8 - 0
include/starpu_hash.h

@@ -39,6 +39,14 @@ extern "C"
 uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc);
 
 /**
+   Compute the CRC of a pointer value seeded by the \p inputcrc
+   <em>current state</em>. The return value should be considered as the new
+   <em>current state</em> for future CRC computation. This is used for computing
+   data size footprint.
+*/
+uint32_t starpu_hash_crc32c_be_ptr(void *input, uint32_t inputcrc);
+
+/**
    Compute the CRC of a 32bit number seeded by the \p inputcrc
    <em>current state</em>. The return value should be considered as the new
    <em>current state</em> for future CRC computation. This is used for computing

+ 3 - 5
include/starpu_perfmodel.h

@@ -310,10 +310,10 @@ struct starpu_perfmodel
 void starpu_perfmodel_init(struct starpu_perfmodel *model);
 
 /**
-   Deinitialize the \p model performance model structure. You need to call this 
-   before deallocating the structure. You will probably want to call 
+   Deinitialize the \p model performance model structure. You need to call this
+   before deallocating the structure. You will probably want to call
    starpu_perfmodel_unload_model() before calling this function, to save the perfmodel.
-*/   
+*/
 int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
 
 /**
@@ -322,7 +322,6 @@ int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
    - \p workerid is the worker on which calibration is to be performed (in the case of GPUs, use -1 for CPUs)
    - \p archi is the type of architecture on which calibration will be run
 */
-
 int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
 
 /**
@@ -335,7 +334,6 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
    - \p workerid is the worker on which calibration was performed (in the case of GPUs, use -1 for CPUs)
    - \p archi is the type of architecture on which calibration was run
 */
-
 int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi);
 
 

+ 28 - 2
include/starpu_task.h

@@ -861,7 +861,28 @@ struct starpu_task
 	*/
 	void *prologue_callback_arg;
 
+	/** Optional field, the default value is <c>NULL</c>. This is a
+	   function pointer of prototype <c>void (*f)(void*)</c>
+	   which specifies a possible callback. If this pointer is
+	   non-<c>NULL</c>, the callback function is executed on the host
+	   when the task is pop-ed from the scheduler, just before getting
+	   executed. The callback is passed the value contained in the
+	   starpu_task::prologue_callback_pop_arg field.
+	   No callback is executed if the field is set to <c>NULL</c>.
+
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_PROLOGUE_CALLBACK_POP followed by the function pointer.
+	*/
 	void (*prologue_callback_pop_func)(void *);
+	/**
+	   Optional field, the default value is <c>NULL</c>. This is
+	   the pointer passed to the prologue_callback_pop function. This
+	   field is ignored if the field
+	   starpu_task::prologue_callback_pop_func is set to <c>NULL</c>.
+
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_PROLOGUE_CALLBACK_POP_ARG followed by the argument.
+	   */
 	void *prologue_callback_pop_arg;
 
 	/**
@@ -1424,8 +1445,13 @@ struct starpu_task
 	do {								\
 		if ((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS || (task)->cl->nbuffers > STARPU_NMAXBUFS) \
 			if ((task)->dyn_modes) (task)->dyn_modes[i] = mode; else (task)->modes[i] = mode; \
-		else							\
-			STARPU_CODELET_SET_MODE((task)->cl, mode, i);	\
+		else \
+		{							\
+			enum starpu_data_access_mode cl_mode = STARPU_CODELET_GET_MODE((task)->cl, i); \
+			STARPU_ASSERT_MSG(cl_mode == mode,	\
+				"Task <%s> can't set its  %d-th buffer mode to %d as the codelet it derives from uses %d", \
+				(task)->cl->name, i, mode, cl_mode);	\
+		} \
 	} while(0)
 
 /**

+ 4 - 0
include/starpu_util.h

@@ -257,6 +257,10 @@ extern "C"
 	_starpu_abort();				\
 } while(0)
 
+#if defined(_MSC_VER)
+  #undef STARPU_HAVE_STRERROR_R
+#endif
+
 #if defined(STARPU_HAVE_STRERROR_R)
 #if (! defined(__GLIBC__) || !__GLIBC__) || ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && (! defined(_GNU_SOURCE)))
 /* XSI-compliant version of strerror_r returns an int */

+ 2 - 2
julia/README

@@ -20,8 +20,8 @@ $ make
 Then, you need to add the lib/ directory to your library path and the julia/
 directory to your Julia load path:
 
-$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/lib
-$ export JULIA_LOAD_PATH=$JULIA_LOAD_PATH:$PWD
+$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/src/.lib
+$ export JULIA_LOAD_PATH=$PWD/src:$JULIA_LOAD_PATH
 
 This step can also be done by sourcing the setenv.sh script:
 

+ 2 - 2
julia/examples/execute.sh.in

@@ -1,7 +1,7 @@
 #!@REALBASH@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020-2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,7 +16,7 @@
 #
 
 set -x
-export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia:$JULIA_LOAD_PATH
+export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia/src:$JULIA_LOAD_PATH
 export STARPU_BUILD_DIR=@STARPU_BUILD_DIR@
 export STARPU_SRC_DIR=@STARPU_SRC_DIR@
 export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3

+ 2 - 2
julia/setenv.sh

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020-2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -13,7 +13,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
-export JULIA_LOAD_PATH=$JULIA_LOAD_PATH:$PWD
+export JULIA_LOAD_PATH=$PWD/src:$JULIA_LOAD_PATH
 
 if [ `uname` == "Darwin" ]; then
     export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$PWD/lib/

+ 1 - 1
julia/src/StarPU.jl

@@ -65,7 +65,7 @@ export STARPU_HISTORY_BASED, STARPU_REGRESSION_BASED
 export STARPU_NL_REGRESSION_BASED, STARPU_MULTIPLE_REGRESSION_BASED
 export starpu_tag_t
 export STARPU_NONE,STARPU_R,STARPU_W,STARPU_RW, STARPU_SCRATCH
-export STARPU_REDUX,STARPU_COMMUTE, STARPU_SSEND, STARPU_LOCALITY
+export STARPU_MPI_REDUX, STARPU_REDUX,STARPU_COMMUTE, STARPU_SSEND, STARPU_LOCALITY
 export STARPU_ACCESS_MODE_MAX
 
 # BLAS

+ 89 - 2
mpi/examples/Makefile.am

@@ -272,9 +272,27 @@ starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky_distributed
 endif
 
-########################
+##############
+# CG example #
+##############
+
+if !STARPU_SIMGRID
+if !STARPU_NO_BLAS_LIB
+examplebin_PROGRAMS += cg/cg
+starpu_mpi_EXAMPLES += cg/cg
+
+cg_cg_SOURCES =					\
+	cg/cg.c						\
+	../../examples/common/blas.c
+
+cg_cg_LDADD =					\
+	$(STARPU_BLAS_LDFLAGS)
+endif
+endif
+
+###########################
 # MPI Matrix mult example #
-########################
+###########################
 
 examplebin_PROGRAMS +=		\
 	matrix_mult/mm
@@ -290,6 +308,24 @@ starpu_mpi_EXAMPLES +=				\
 	matrix_mult/mm
 endif
 
+########################
+# MPI STARPU_MPI_REDUX #
+########################
+
+examplebin_PROGRAMS +=		\
+	mpi_redux/mpi_redux
+
+mpi_redux_mpi_redux_SOURCES	=		\
+	mpi_redux/mpi_redux.c
+
+mpi_redux_mpi_redux_LDADD =			\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	mpi_redux/mpi_redux
+endif
+
 ##########################################
 # Native Fortran MPI Matrix mult example #
 ##########################################
@@ -336,6 +372,55 @@ endif
 endif
 endif
 
+############################################
+# Native Fortran MPI STARPU_MPI_REDUX test #
+############################################
+
+if STARPU_HAVE_MPIFORT
+if !STARPU_SANITIZE
+examplebin_PROGRAMS +=		\
+	native_fortran/nf_mpi_redux
+
+native_fortran_nf_mpi_redux_SOURCES	=			\
+	native_fortran/fstarpu_mpi_mod.f90	\
+	native_fortran/fstarpu_mod.f90		\
+	native_fortran/nf_mpi_redux.f90	
+
+native_fortran_nf_mpi_redux_LDADD =					\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	native_fortran/nf_mpi_redux
+endif
+endif
+endif
+
+########################################
+# Native Fortran MPI STARPU_REDUX test #
+########################################
+
+if STARPU_HAVE_MPIFORT
+if !STARPU_SANITIZE
+examplebin_PROGRAMS +=		\
+	native_fortran/nf_redux_test
+
+native_fortran_nf_redux_test_SOURCES	=			\
+	native_fortran/fstarpu_mpi_mod.f90	\
+	native_fortran/fstarpu_mod.f90		\
+	native_fortran/nf_redux_test.f90	
+
+native_fortran_nf_redux_test_LDADD =					\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	native_fortran/nf_redux_test
+endif
+endif
+endif
+
+
 ###################
 # complex example #
 ###################
@@ -427,6 +512,8 @@ native_fortran/nf_mm_cl.o: fstarpu_mod.mod
 native_fortran/nf_mm.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
+native_fortran/nf_redux_test.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
+native_fortran/nf_mpi_redux.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 endif
 endif
 

+ 422 - 0
mpi/examples/cg/cg.c

@@ -0,0 +1,422 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <assert.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <common/blas.h>
+
+/*
+ * Distributed version of Conjugate Gradient implemented in examples/cg/cg.c
+ *
+ * Use -display-result option and compare with the non-distributed version: the
+ * x vector should be the same.
+ */
+
+#include "../../../examples/cg/cg.h"
+
+static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, unsigned nblocks);
+
+#define HANDLE_TYPE_VECTOR starpu_data_handle_t*
+#define HANDLE_TYPE_MATRIX starpu_data_handle_t**
+#define TASK_INSERT(cl, ...) starpu_mpi_task_insert(MPI_COMM_WORLD, cl, ##__VA_ARGS__)
+#define GET_VECTOR_BLOCK(v, i) v[i]
+#define GET_MATRIX_BLOCK(m, i, j) m[i][j]
+#define BARRIER() starpu_mpi_barrier(MPI_COMM_WORLD);
+#define GET_DATA_HANDLE(handle) starpu_mpi_get_data_on_all_nodes_detached(MPI_COMM_WORLD, handle)
+
+static int block_size;
+
+static int rank;
+static int nodes_p = 2;
+static int nodes_q;
+
+static TYPE ***A;
+static TYPE **x;
+static TYPE **b;
+
+static TYPE **r;
+static TYPE **d;
+static TYPE **q;
+
+#define FPRINTF_SERVER(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT") && rank == 0) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+#include "../../../examples/cg/cg_kernels.c"
+
+static int my_distrib(const int y, const int x)
+{
+	return (y%nodes_q)*nodes_p + (x%nodes_p);
+}
+
+static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, unsigned nblocks)
+{
+	unsigned b;
+
+	for (b = 0; b < nblocks; b++)
+	{
+		if (rank == my_distrib(b, 0))
+		{
+			starpu_data_cpy(dst[b], src[b], /* asynchronous */ 1, /* without callback */ NULL, NULL);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ *	Generate Input data
+ */
+static void generate_random_problem(void)
+{
+	unsigned nn, mm, m, n, mpi_rank;
+
+	A = malloc(nblocks * sizeof(TYPE **));
+	x = malloc(nblocks * sizeof(TYPE *));
+	b = malloc(nblocks * sizeof(TYPE *));
+
+	r = malloc(nblocks * sizeof(TYPE *));
+	d = malloc(nblocks * sizeof(TYPE *));
+	q = malloc(nblocks * sizeof(TYPE *));
+
+	for (m = 0; m < nblocks; m++)
+	{
+		A[m] = malloc(nblocks * sizeof(TYPE*));
+
+		mpi_rank = my_distrib(m, 0);
+
+		if (mpi_rank == rank || display_result)
+		{
+			starpu_malloc((void**) &x[m], block_size*sizeof(TYPE));
+		}
+
+		if (mpi_rank == rank)
+		{
+			starpu_malloc((void**) &b[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &r[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &d[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &q[m], block_size*sizeof(TYPE));
+
+			for (mm = 0; mm < block_size; mm++)
+			{
+				x[m][mm] = (TYPE) 0.0;
+				b[m][mm] = (TYPE) 1.0;
+				r[m][mm] = (TYPE) 0.0;
+				d[m][mm] = (TYPE) 0.0;
+				q[m][mm] = (TYPE) 0.0;
+			}
+		}
+
+		for (n = 0; n < nblocks; n++)
+		{
+			mpi_rank = my_distrib(m, n);
+			if (mpi_rank == rank)
+			{
+				starpu_malloc((void**) &A[m][n], block_size*block_size*sizeof(TYPE));
+
+				for (nn = 0; nn < block_size; nn++)
+				{
+					for (mm = 0; mm < block_size; mm++)
+					{
+						/* We take the Hilbert matrix that is not well conditioned but positive definite: H(i,j) = 1/(1+i+j) */
+						A[m][n][mm + nn*block_size] = (TYPE) (1.0/(1.0+(nn+(m*block_size)+mm+(n*block_size))));
+					}
+				}
+			}
+		}
+	}
+}
+
+static void free_data(void)
+{
+	unsigned nn, mm, m, n, mpi_rank;
+
+	for (m = 0; m < nblocks; m++)
+	{
+		mpi_rank = my_distrib(m, 0);
+
+		if (mpi_rank == rank || display_result)
+		{
+			starpu_free((void*) x[m]);
+		}
+
+		if (mpi_rank == rank)
+		{
+			starpu_free((void*) b[m]);
+			starpu_free((void*) r[m]);
+			starpu_free((void*) d[m]);
+			starpu_free((void*) q[m]);
+		}
+
+		for (n = 0; n < nblocks; n++)
+		{
+			mpi_rank = my_distrib(m, n);
+			if (mpi_rank == rank)
+			{
+				starpu_free((void*) A[m][n]);
+			}
+		}
+
+		free(A[m]);
+	}
+
+	free(A);
+	free(x);
+	free(b);
+	free(r);
+	free(d);
+	free(q);
+}
+
+/* Register every vector/matrix block with StarPU and StarPU-MPI.
+ * Owner ranks register their local buffer in STARPU_MAIN_RAM; non-owner ranks
+ * register a placeholder handle (home node -1, NULL pointer) so StarPU-MPI can
+ * allocate a temporary buffer on demand when data is received.  Each handle is
+ * given a unique MPI tag (monotonically increasing mpi_tag) and coordinates
+ * for offline tracing tools.  When reductions are enabled, q/r and the two
+ * scalar accumulators get their redux/init codelets attached. */
+static void register_data(void)
+{
+	unsigned m, n;
+	int mpi_rank;
+	starpu_mpi_tag_t mpi_tag = 0;
+
+	A_handle = malloc(nblocks*sizeof(starpu_data_handle_t*));
+	x_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	b_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	r_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	d_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	q_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+	for (m = 0; m < nblocks; m++)
+	{
+		mpi_rank = my_distrib(m, 0);
+		A_handle[m] = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+		/* x needs a real buffer everywhere when the result will be printed */
+		if (mpi_rank == rank || display_result)
+		{
+			starpu_vector_data_register(&x_handle[m], STARPU_MAIN_RAM, (uintptr_t) x[m], block_size, sizeof(TYPE));
+		}
+		else if (!display_result)
+		{
+			assert(mpi_rank != rank);
+			starpu_vector_data_register(&x_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+		}
+
+		if (mpi_rank == rank)
+		{
+			starpu_vector_data_register(&b_handle[m], STARPU_MAIN_RAM, (uintptr_t) b[m], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&r_handle[m], STARPU_MAIN_RAM, (uintptr_t) r[m], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&d_handle[m], STARPU_MAIN_RAM, (uintptr_t) d[m], block_size, sizeof(TYPE));
+			starpu_vector_data_register(&q_handle[m], STARPU_MAIN_RAM, (uintptr_t) q[m], block_size, sizeof(TYPE));
+		}
+		else
+		{
+			starpu_vector_data_register(&b_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&r_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&d_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+			starpu_vector_data_register(&q_handle[m], -1, (uintptr_t) NULL, block_size, sizeof(TYPE));
+		}
+
+		/* 1D coordinates (block row index) for the trace tools */
+		starpu_data_set_coordinates(x_handle[m], 1, m);
+		starpu_mpi_data_register(x_handle[m], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(b_handle[m], 1, m);
+		starpu_mpi_data_register(b_handle[m], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(r_handle[m], 1, m);
+		starpu_mpi_data_register(r_handle[m], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(d_handle[m], 1, m);
+		starpu_mpi_data_register(d_handle[m], ++mpi_tag, mpi_rank);
+		starpu_data_set_coordinates(q_handle[m], 1, m);
+		starpu_mpi_data_register(q_handle[m], ++mpi_tag, mpi_rank);
+
+		if (use_reduction)
+		{
+			starpu_data_set_reduction_methods(q_handle[m], &accumulate_vector_cl, &bzero_vector_cl);
+			starpu_data_set_reduction_methods(r_handle[m], &accumulate_vector_cl, &bzero_vector_cl);
+		}
+
+		for (n = 0; n < nblocks; n++)
+		{
+			mpi_rank = my_distrib(m, n);
+
+			if (mpi_rank == rank)
+			{
+				starpu_matrix_data_register(&A_handle[m][n], STARPU_MAIN_RAM, (uintptr_t) A[m][n], block_size, block_size, block_size, sizeof(TYPE));
+			}
+			else
+			{
+				starpu_matrix_data_register(&A_handle[m][n], -1, (uintptr_t) NULL, block_size, block_size, block_size, sizeof(TYPE));
+			}
+
+			/* 2D coordinates (column, row) for the trace tools */
+			starpu_data_set_coordinates(A_handle[m][n], 2, n, m);
+			starpu_mpi_data_register(A_handle[m][n], ++mpi_tag, mpi_rank);
+		}
+	}
+
+	/* Scalar accumulators for the dot products, owned by rank 0 */
+	starpu_variable_data_register(&dtq_handle, STARPU_MAIN_RAM, (uintptr_t)&dtq, sizeof(TYPE));
+	starpu_variable_data_register(&rtr_handle, STARPU_MAIN_RAM, (uintptr_t)&rtr, sizeof(TYPE));
+	starpu_mpi_data_register(rtr_handle, ++mpi_tag, 0);
+	starpu_mpi_data_register(dtq_handle, ++mpi_tag, 0);
+
+	if (use_reduction)
+	{
+		starpu_data_set_reduction_methods(dtq_handle, &accumulate_variable_cl, &bzero_variable_cl);
+		starpu_data_set_reduction_methods(rtr_handle, &accumulate_variable_cl, &bzero_variable_cl);
+	}
+}
+
+/* Unregister every handle created by register_data() and free the handle
+ * arrays.  Unregistering also flushes any cached replicates back to the
+ * home node before the underlying buffers are freed by free_data(). */
+static void unregister_data(void)
+{
+	unsigned m, n;
+
+	for (m = 0; m < nblocks; m++)
+	{
+		starpu_data_unregister(x_handle[m]);
+		starpu_data_unregister(b_handle[m]);
+		starpu_data_unregister(r_handle[m]);
+		starpu_data_unregister(d_handle[m]);
+		starpu_data_unregister(q_handle[m]);
+
+		for (n = 0; n < nblocks; n++)
+		{
+			starpu_data_unregister(A_handle[m][n]);
+		}
+
+		free(A_handle[m]);
+	}
+
+	starpu_data_unregister(dtq_handle);
+	starpu_data_unregister(rtr_handle);
+
+	free(A_handle);
+	free(x_handle);
+	free(b_handle);
+	free(r_handle);
+	free(d_handle);
+	free(q_handle);
+}
+
+/* Gather all blocks of the solution vector x on rank 0 and print them.
+ * Requires that register_data() registered real x buffers on every rank
+ * (which is the case when display_result is set). */
+static void display_x_result(void)
+{
+	int j, i; /* NOTE(review): signed counters compared to unsigned nblocks/block_size */
+
+	for (j = 0; j < nblocks; j++)
+	{
+		starpu_mpi_get_data_on_node(MPI_COMM_WORLD, x_handle[j], 0);
+	}
+
+	if (rank == 0)
+	{
+		FPRINTF_SERVER(stderr, "Computed X vector:\n");
+		for (j = 0; j < nblocks; j++)
+		{
+			/* Acquire in read mode to get a coherent CPU view before printing */
+			starpu_data_acquire(x_handle[j], STARPU_R);
+			for (i = 0; i < block_size; i++)
+			{
+				FPRINTF(stderr, "% 02.2e\n", x[j][i]);
+			}
+			starpu_data_release(x_handle[j]);
+		}
+	}
+}
+
+/* Parse the MPI-specific command line options (-p: width of the node grid,
+ * -h/--help: usage message), then delegate the options shared with the
+ * non-MPI CG example to parse_common_args(). */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-p") == 0)
+		{
+			nodes_p = atoi(argv[++i]);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
+		{
+			FPRINTF_SERVER(stderr, "usage: %s [-h] [-nblocks #blocks] [-display-result] [-p node_grid_width] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
+			exit(-1);
+		}
+	}
+
+	parse_common_args(argc, argv);
+}
+
+/* Distributed CG driver: initialize StarPU-MPI, validate that the node grid
+ * and block count divide the problem evenly, build and register the data,
+ * run the CG iteration (cg()), optionally display the result, then tear
+ * everything down.  Returns 77 (test skipped) when no device is available. */
+int main(int argc, char **argv)
+{
+	int worldsize, ret;
+	double start, end;
+
+	/* Not supported yet */
+	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
+		return 77;
+
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	parse_args(argc, argv);
+
+	/* The ranks are laid out on a nodes_p x nodes_q grid */
+	if (worldsize % nodes_p != 0)
+	{
+		FPRINTF_SERVER(stderr, "Node grid (%d) width must divide the number of nodes (%d).\n", nodes_p, worldsize);
+		starpu_mpi_shutdown();
+		return 1;
+	}
+	nodes_q = worldsize / nodes_p;
+
+	if (n % nblocks != 0)
+	{
+		FPRINTF_SERVER(stderr, "The number of blocks (%d) must divide the matrix size (%lld).\n", nblocks, n);
+		starpu_mpi_shutdown();
+		return 1;
+	}
+	block_size = n / nblocks;
+
+	starpu_cublas_init();
+
+	FPRINTF_SERVER(stderr, "************** PARAMETERS ***************\n");
+	FPRINTF_SERVER(stderr, "%d nodes (%dx%d)\n", worldsize, nodes_p, nodes_q);
+	FPRINTF_SERVER(stderr, "Problem size (-n): %lld\n", n);
+	FPRINTF_SERVER(stderr, "Maximum number of iterations (-maxiter): %d\n", i_max);
+	FPRINTF_SERVER(stderr, "Number of blocks (-nblocks): %d\n", nblocks);
+	FPRINTF_SERVER(stderr, "Reduction (-no-reduction): %s\n", use_reduction ? "enabled" : "disabled");
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	start = starpu_timing_now();
+	generate_random_problem();
+	register_data();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	end = starpu_timing_now();
+
+	/* NOTE(review): starpu_timing_now() is in microseconds; dividing by 10e6
+	 * (i.e. 1e7) to print "seconds" looks off by a factor of 10 -- TODO confirm */
+	FPRINTF_SERVER(stderr, "Problem initialization timing : %2.2f seconds\n", (end-start)/10e6);
+
+	ret = cg();
+	if (ret == -ENODEV)
+	{
+		ret = 77;
+		goto enodev;
+	}
+
+	starpu_task_wait_for_all();
+
+	if (display_result)
+	{
+		display_x_result();
+	}
+
+enodev:
+	unregister_data();
+	free_data();
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	return ret;
+}

+ 201 - 0
mpi/examples/mpi_redux/mpi_redux.c

@@ -0,0 +1,201 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example illustrates how to use the STARPU_MPI_REDUX mode
+ * and compare it with the standard STARPU_REDUX.
+ *
+ * In order to make this comparison salient, the init codelet is not
+ * a task that sets the handle to a neutral element but rather to a value
+ * that depends on the working node.
+ * This is not a proper way to use a reduction pattern; however, it
+ * can be seen as modelling the cost/weight of each contribution.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+#include <unistd.h>
+
+/* CPU kernel of the work tasks: a += 3.0 + b.
+ * The sleep(2) simulates a non-trivial task duration so the difference
+ * between the two reduction modes is observable in the trace. */
+static void cl_cpu_work(void *handles[], void*arg)
+{
+	(void)arg;
+	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
+	sleep(2);
+	printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a);
+	*a = 3.0 + *a + *b;
+	printf("%f\n",*a);
+}
+
+/* Work codelet for the STARPU_REDUX variant: the accumulator is accessed
+ * in STARPU_REDUX mode (per-worker copies, reduced afterwards).
+ * NOTE(review): .name = "task_init" looks like a copy-paste; "task_work"
+ * would be clearer in traces -- TODO confirm intent. */
+static struct starpu_codelet work_cl =
+{
+	.cpu_funcs = { cl_cpu_work },
+	.nbuffers = 2,
+	.modes = { STARPU_REDUX, STARPU_R },
+	.name = "task_init"
+};
+
+/* Work codelet for the STARPU_MPI_REDUX variant: within a node the
+ * accumulator is accessed RW|COMMUTE (tasks may run in any order on the
+ * single local copy); the inter-node reduction is driven by the
+ * STARPU_MPI_REDUX access mode given at task-insertion time. */
+static struct starpu_codelet mpi_work_cl =
+{
+	.cpu_funcs = { cl_cpu_work },
+	.nbuffers = 2,
+	.modes = { STARPU_RW | STARPU_COMMUTE, STARPU_R },
+	.name = "task_init-mpi"
+};
+
+/* Init kernel for the reduction: deliberately NOT a neutral element --
+ * it seeds the copy with the MPI rank (see the file-header comment),
+ * which makes the number of spawned copies visible in the final result. */
+static void cl_cpu_task_init(void *handles[], void*arg)
+{
+	(void) arg;
+	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	sleep(1);
+	printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(), *a);
+	*a = starpu_mpi_world_rank();
+}
+
+/* Codelet wrapping cl_cpu_task_init; registered as the per-copy
+ * initializer via starpu_data_set_reduction_methods(). */
+static struct starpu_codelet task_init_cl =
+{
+	.cpu_funcs = { cl_cpu_task_init },
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+	.name = "task_init"
+};
+
+/* Reduction kernel: fold the source copy (handles[1]) into the
+ * destination copy (handles[0]) by addition. */
+static void cl_cpu_task_red(void *handles[], void*arg)
+{
+	(void) arg;
+	double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
+	sleep(2);
+	printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad);
+	*ad = *ad + *as;
+}
+
+/* Codelet wrapping cl_cpu_task_red; registered as the reduction
+ * operator via starpu_data_set_reduction_methods(). */
+static struct starpu_codelet task_red_cl =
+{
+	.cpu_funcs = { cl_cpu_task_red },
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+	.name = "task_red"
+};
+
+/* Run the same accumulation workload twice -- first with STARPU_MPI_REDUX
+ * (one local copy per node, RW|COMMUTE codelet), then with STARPU_REDUX
+ * (one copy per worker) -- and print the reduced result on rank 0 so the
+ * two behaviours can be compared.  Needs at least 2 ranks and 2 CPU
+ * workers per rank; returns STARPU_TEST_SKIPPED otherwise. */
+int main(int argc, char *argv[])
+{
+	int comm_rank, comm_size;
+	/* Initializes StarPU and the StarPU-MPI layer */
+	starpu_fxt_autostart_profiling(0);
+	int ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	/* NOTE(review): message typo -- should read "starpu_mpi_init_conf" */
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_ini_conft");
+
+	int nworkers = starpu_cpu_worker_get_count();
+	if (nworkers < 2)
+	{
+        	FPRINTF(stderr, "We need at least 2 CPU worker per node.\n");
+        	starpu_mpi_shutdown();
+       		return STARPU_TEST_SKIPPED;
+	}
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size);
+	if (comm_size < 2)
+	{
+        	FPRINTF(stderr, "We need at least 2 nodes.\n");
+        	starpu_mpi_shutdown();
+       		return STARPU_TEST_SKIPPED;
+	}
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
+
+	/* a: accumulator owned by rank 0; b[j]: per-rank contribution */
+	double a, b[comm_size];
+	starpu_data_handle_t a_h, b_h[comm_size];
+	double work_coef = 2;
+	enum starpu_data_access_mode codelet_mode; /* NOTE(review): unused variable */
+	enum starpu_data_access_mode task_mode;
+	int i,j,work_node;
+    	starpu_mpi_tag_t tag = 0;
+	/* Iteration 0: STARPU_MPI_REDUX; iteration 1: STARPU_REDUX */
+	for (i = 0 ; i < 2 ; i++)
+	{
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		if (i==0)
+			task_mode = STARPU_MPI_REDUX;
+		else
+			task_mode = STARPU_REDUX;
+		if (comm_rank == 0)
+		{
+			a = 1.0;
+			printf("init a = %f\n", a);
+			starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double));
+			for (j=0;j<comm_size;j++)
+				starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
+		}
+		else
+		{
+			b[comm_rank] = 1.0 / (comm_rank + 1.0);
+			printf("init b_%d = %f\n", comm_rank, b[comm_rank]);
+			starpu_variable_data_register(&a_h, -1, 0, sizeof(double));
+			for (j=0;j<comm_size;j++)
+			{
+				if (j == comm_rank)
+					starpu_variable_data_register(&b_h[j], STARPU_MAIN_RAM, (uintptr_t)&b[j], sizeof(double));
+				else
+					starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
+			}
+		}
+		starpu_mpi_data_register(a_h, tag++, 0);
+		for (j=0;j<comm_size;j++)
+			starpu_mpi_data_register(b_h[j], tag++, j);
+
+		starpu_data_set_reduction_methods(a_h, &task_red_cl, &task_init_cl);
+		starpu_fxt_start_profiling();
+		/* Each non-zero rank contributes work_coef*nworkers tasks */
+		for (work_node=1; work_node < comm_size;work_node++)
+		{
+			for (j=1;j<=work_coef*nworkers;j++)
+			{
+				if (i == 0)
+				    starpu_mpi_task_insert(MPI_COMM_WORLD,
+					&mpi_work_cl,
+					task_mode, a_h,
+					STARPU_R, b_h[work_node],
+					STARPU_EXECUTE_ON_NODE, work_node,
+					0);
+				else
+				    starpu_mpi_task_insert(MPI_COMM_WORLD,
+					&work_cl,
+					task_mode, a_h,
+					STARPU_R, b_h[work_node],
+					STARPU_EXECUTE_ON_NODE, work_node,
+					0);
+			}
+		}
+		/* Collapse all remote/worker copies of a_h back to its owner (rank 0) */
+		starpu_mpi_redux_data(MPI_COMM_WORLD, a_h);
+		starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		if (comm_rank == 0)
+		{
+			double tmp = 0.0;
+			for (work_node = 1; work_node < comm_size ; work_node++)
+				tmp += 1.0 / (work_node + 1.0);
+			printf("computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp));
+		}
+		starpu_data_unregister(a_h);
+		for (work_node=0; work_node < comm_size;work_node++)
+			starpu_data_unregister(b_h[work_node]);
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+	}
+	starpu_mpi_shutdown();
+	return 0;
+}

+ 253 - 0
mpi/examples/native_fortran/nf_mpi_redux.f90

@@ -0,0 +1,253 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+!
+! Native-Fortran counterpart of mpi_redux.c: run the same workload with
+! FSTARPU_MPI_REDUX (trial 1) and FSTARPU_REDUX (trial 2) and print the
+! reduced accumulator on rank 0.  Needs >= 2 ranks and >= 1 worker each.
+program nf_mpi_redux
+  use iso_c_binding
+  use fstarpu_mod
+  use fstarpu_mpi_mod
+
+  implicit none
+
+  integer, target                         :: ret, np, i, j, trial
+  type(c_ptr)                             :: work_cl, task_rw_cl,task_red_cl, task_ini_cl
+  character(kind=c_char,len=*), parameter :: name=C_CHAR_"task"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: namered=C_CHAR_"task_red"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: nameini=C_CHAR_"task_ini"//C_NULL_CHAR
+  real(kind(1.d0)), target                :: a,tmp
+  real(kind(1.d0)), target, allocatable   :: b(:)
+  integer(kind=8)                         :: tag, err
+  type(c_ptr)                             :: ahdl
+  type(c_ptr), target, allocatable        :: bhdl(:)
+  type(c_ptr)                             :: task_mode, codelet_mode
+  integer, target                         :: comm_world,comm_w_rank, comm_size
+  integer(c_int), target                  :: w_node, nworkers, work_coef
+
+  call fstarpu_fxt_autostart_profiling(0)
+  ret = fstarpu_init(c_null_ptr)
+  ret = fstarpu_mpi_init(1)
+
+  comm_world = fstarpu_mpi_world_comm()
+  comm_w_rank  = fstarpu_mpi_world_rank()
+  comm_size  = fstarpu_mpi_world_size()
+  if (comm_size.lt.2) then
+    write(*,'(" ")')
+    write(*,'("This application is meant to run with at least two nodes.")')
+    stop 2
+  end if
+  allocate(b(comm_size-1), bhdl(comm_size-1))
+  nworkers = fstarpu_worker_get_count()
+  if (nworkers.lt.1) then
+    write(*,'(" ")')
+    write(*,'("This application is meant to run with at least one worker per node.")')
+    stop 2
+  end if
+
+  ! allocate and reduction codelets
+  task_red_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_red_cl, namered)
+  call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW)
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
+
+  task_ini_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_ini_cl, nameini)
+  call fstarpu_codelet_add_cpu_func(task_ini_cl,C_FUNLOC(cl_cpu_task_ini))
+  call fstarpu_codelet_add_buffer(task_ini_cl, FSTARPU_W)
+
+  work_coef=2
+
+  ! trial 1 uses STARPU_MPI_REDUX (RW|COMMUTE codelet), trial 2 STARPU_REDUX
+  do trial=1,2
+
+  if (trial.eq.1) then
+        write(*,*) "Using STARPU_MPI_REDUX"
+        codelet_mode = FSTARPU_RW.ior.FSTARPU_COMMUTE
+        task_mode = FSTARPU_MPI_REDUX
+  else if (trial.eq.2) then
+        write(*,*) "Using STARPU_REDUX"
+        codelet_mode = FSTARPU_REDUX
+        task_mode = FSTARPU_REDUX
+  end if
+  ! allocate and fill codelet structs
+  work_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(work_cl, name)
+  call fstarpu_codelet_add_cpu_func(work_cl, C_FUNLOC(cl_cpu_task))
+  call fstarpu_codelet_add_buffer(work_cl, codelet_mode)
+  call fstarpu_codelet_add_buffer(work_cl, FSTARPU_R)
+  err = fstarpu_mpi_barrier(comm_world)
+
+  if(comm_w_rank.eq.0) then
+    write(*,'(" ")')
+    a = 1.0
+    write(*,*) "init a = ", a
+  else
+    b(comm_w_rank) = 1.0 / (comm_w_rank + 1.0)
+    write(*,*) "init b_",comm_w_rank,"=", b(comm_w_rank), " AT ", &
+c_loc(bhdl(comm_w_rank)) ! This is not really meaningful
+  end if
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+  ! rank 0 owns the accumulator a, rank i owns contribution b(i)
+  tag = 0
+  if(comm_w_rank.eq.0) then
+    call fstarpu_variable_data_register(ahdl, 0, c_loc(a),c_sizeof(a))
+    do i=1,comm_size-1
+        call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
+    end do
+  else
+    call fstarpu_variable_data_register(ahdl, -1, c_null_ptr,c_sizeof(a))
+    do i=1,comm_size-1
+      if (i.eq.comm_w_rank) then
+        call fstarpu_variable_data_register(bhdl(i), 0, c_loc(b(i)),c_sizeof(b(i)))
+      else
+        call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
+      end if
+    end do
+  end if
+  call fstarpu_mpi_data_register(ahdl,  tag,  0)
+  do i=1,comm_size-1
+     call fstarpu_mpi_data_register(bhdl(i), tag+i,i)
+  end do
+
+  tag = tag + comm_size
+
+  call fstarpu_data_set_reduction_methods(ahdl,task_red_cl,task_ini_cl)
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+
+  call fstarpu_fxt_start_profiling()
+  do w_node=1,comm_size-1
+    do i=1,work_coef*nworkers
+      call fstarpu_mpi_task_insert( (/ c_loc(comm_world),   &
+             work_cl,                                         &
+             task_mode, ahdl,                            &
+             FSTARPU_R, bhdl(w_node),                      &
+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+             C_NULL_PTR /))
+    end do
+  end do
+  ! collapse all copies of the accumulator back onto its owner (rank 0)
+  call fstarpu_mpi_redux_data(comm_world, ahdl)
+  err = fstarpu_mpi_wait_for_all(comm_world)
+
+  if(comm_w_rank.eq.0) then
+    tmp = 0
+    do w_node=1,comm_size-1
+      tmp = tmp + 1.0 / (w_node+1.0)
+    end do
+    write(*,*) 'computed result ---> ',a, "expected =",&
+      1.0 + (comm_size-1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1.0)*3.0 + tmp)
+  end if
+  err = fstarpu_mpi_barrier(comm_world)
+  call fstarpu_data_unregister(ahdl)
+  do w_node=1,comm_size-1
+    call fstarpu_data_unregister(bhdl(w_node))
+  end do
+  call fstarpu_codelet_free(work_cl)
+
+  end do
+
+  call fstarpu_fxt_stop_profiling()
+  call fstarpu_codelet_free(task_red_cl)
+  call fstarpu_codelet_free(task_ini_cl)
+
+
+  err = fstarpu_mpi_shutdown()
+  call fstarpu_shutdown()
+  deallocate(b, bhdl)
+  stop
+
+contains
+
+  ! Work kernel: a = a + 3.0 + b (sleeps to simulate task duration)
+  recursive subroutine cl_cpu_task (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    integer(c_int) :: ret, worker_id
+    integer        :: comm_rank
+    integer, target :: i
+    real(kind(1.d0)), pointer :: a, b
+    real(kind(1.d0))          :: old_a
+
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), b)
+    call nf_sleep(1.d0)
+    old_a = a
+    a = old_a + 3.0 + b
+    write(*,*) "task   (c_w_rank:",comm_rank," worker_id:",worker_id,") from ",old_a,"to",a
+
+    return
+  end subroutine cl_cpu_task
+
+  ! Reduction kernel: fold source copy (buffer 1) into destination (buffer 0)
+  recursive subroutine cl_cpu_task_red (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    integer(c_int) :: ret, worker_id
+    integer, target                         :: comm_rank
+    real(kind(1.d0)), pointer :: as, ad
+    real(kind(1.d0))           :: old_ad
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), ad)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), as)
+    old_ad = ad
+    ad = ad + as
+    call nf_sleep(1.d0)
+    write(*,*) "red_cl (c_w_rank:",comm_rank,"worker_id:",worker_id,")",as, old_ad, ' ---> ',ad
+
+    return
+  end subroutine cl_cpu_task_red
+
+  ! Init kernel: deliberately non-neutral seed (the MPI rank), to make the
+  ! number of spawned copies visible in the result
+  recursive subroutine cl_cpu_task_ini (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args
+        ! cl_args is unused
+    integer(c_int) :: ret, worker_id
+    integer, target                         :: comm_rank
+    real(kind(1.d0)), pointer :: a
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call nf_sleep(0.5d0)
+    ! As this codelet is run by each worker in the REDUX mode case
+    ! this initialization makes salient the number of copies spawned
+    write(*,*) "ini_cl (c_w_rank:",comm_rank,"worker_id:",worker_id,") set to", comm_rank, "(was",a,")"
+    a = comm_rank
+    return
+  end subroutine cl_cpu_task_ini
+
+  ! Busy-wait for t seconds using the system clock (portable "sleep")
+  subroutine nf_sleep(t)
+    implicit none
+    integer :: t_start, t_end, t_rate
+    real(kind(1.d0))     :: ta, t
+    call system_clock(t_start)
+    do
+       call system_clock(t_end, t_rate)
+       ta = real(t_end-t_start)/real(t_rate)
+       if(ta.gt.t) return
+    end do
+  end subroutine nf_sleep
+
+end program

+ 238 - 0
mpi/examples/native_fortran/nf_redux_test.f90

@@ -0,0 +1,238 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+!
+! Native-Fortran STARPU_REDUX example with exactly 4 ranks: rank 0 owns
+! (a1, b1), rank 1 owns (a2, b2); one task per accumulator is executed on
+! ranks 3 and 2, then each accumulator is reduced back onto its owner.
+program main
+  use iso_c_binding
+  use fstarpu_mod
+  use fstarpu_mpi_mod
+
+  implicit none
+
+  integer, target                         :: ret, np, i, j
+  type(c_ptr)                             :: task_cl, task_rw_cl, task_red_cl, task_ini_cl
+  character(kind=c_char,len=*), parameter :: name=C_CHAR_"task"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: namered=C_CHAR_"task_red"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: nameini=C_CHAR_"task_ini"//C_NULL_CHAR
+  real(kind(1.d0)), target                :: a1, a2, b1, b2
+  integer(kind=8)                          :: tag, err
+  type(c_ptr)                             :: a1hdl, a2hdl, b1hdl, b2hdl
+  integer, target                         :: comm, comm_world, comm_w_rank, comm_size
+  integer(c_int), target                  :: w_node
+
+  call fstarpu_fxt_autostart_profiling(0)
+  ret = fstarpu_init(c_null_ptr)
+  ret = fstarpu_mpi_init(1)
+
+  comm_world = fstarpu_mpi_world_comm()
+  comm_w_rank  = fstarpu_mpi_world_rank()
+  comm_size  = fstarpu_mpi_world_size()
+  if (comm_size.ne.4) then
+    write(*,'(" ")')
+    write(*,'("This application is meant to run with 4 MPI")')
+    stop 1
+  end if
+  err   = fstarpu_mpi_barrier(comm_world)
+
+  if(comm_w_rank.eq.0) then
+    write(*,'(" ")')
+    a1 = 1.0
+    write(*,*) "init_a1", a1
+    b1 = 0.5
+    write(*,*) "init b1", b1
+  end if
+  if(comm_w_rank.eq.1) then
+    write(*,'(" ")')
+    a2 = 2.0
+    write(*,*) "init_a2", a2
+    b2 = 0.8
+    write(*,*) "init b2", b2
+  end if
+
+  ! allocate and fill codelet structs
+  task_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_cl, name)
+  call fstarpu_codelet_add_cpu_func(task_cl, C_FUNLOC(cl_cpu_task))
+  call fstarpu_codelet_add_buffer(task_cl, FSTARPU_REDUX)
+  call fstarpu_codelet_add_buffer(task_cl, FSTARPU_R)
+
+  ! allocate and reduction codelets
+  task_red_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_red_cl, namered)
+  call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW)
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
+
+  task_ini_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_ini_cl, nameini)
+  call fstarpu_codelet_add_cpu_func(task_ini_cl,C_FUNLOC(cl_cpu_task_ini))
+  call fstarpu_codelet_add_buffer(task_ini_cl, FSTARPU_W)
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+  ! rank 0 owns a1/b1; other ranks register placeholder handles
+  tag = 0
+  if(comm_w_rank.eq.0) then
+        call fstarpu_variable_data_register(a1hdl, 0, c_loc(a1),c_sizeof(a1))
+        call fstarpu_variable_data_register(b1hdl, 0, c_loc(b1),c_sizeof(b1))
+  else
+        call fstarpu_variable_data_register(a1hdl, -1, c_null_ptr,c_sizeof(a1))
+        call fstarpu_variable_data_register(b1hdl, -1, c_null_ptr,c_sizeof(b1))
+  end if
+  call fstarpu_mpi_data_register(a1hdl,tag,0)
+  call fstarpu_mpi_data_register(b1hdl, tag+1,0)
+
+  ! rank 1 owns a2/b2
+  tag = tag + 2
+  if(comm_w_rank.eq.1) then
+        call fstarpu_variable_data_register(a2hdl, 0, c_loc(a2),c_sizeof(a2))
+        call fstarpu_variable_data_register(b2hdl, 0, c_loc(b2),c_sizeof(b2))
+  else
+        call fstarpu_variable_data_register(a2hdl, -1, c_null_ptr,c_sizeof(a2))
+        call fstarpu_variable_data_register(b2hdl, -1, c_null_ptr,c_sizeof(b2))
+  end if
+  call fstarpu_mpi_data_register(a2hdl,tag,1)
+  call fstarpu_mpi_data_register(b2hdl, tag+1, 1)
+  tag = tag + 2
+
+  call fstarpu_data_set_reduction_methods(a1hdl, task_red_cl,task_ini_cl)
+  call fstarpu_data_set_reduction_methods(a2hdl, task_red_cl,task_ini_cl)
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+  call fstarpu_fxt_start_profiling()
+
+  ! run one task per accumulator on a remote rank (3 for a1, 2 for a2)
+  w_node = 3
+  comm = comm_world
+  call fstarpu_mpi_task_insert( (/ c_loc(comm),   &
+             task_cl,                                         &
+             FSTARPU_REDUX, a1hdl,                            &
+             FSTARPU_R, b1hdl,                                &
+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+             C_NULL_PTR /))
+  w_node = 2
+  comm = comm_world
+  call fstarpu_mpi_task_insert( (/ c_loc(comm),   &
+             task_cl,                                         &
+             FSTARPU_REDUX, a2hdl,                            &
+             FSTARPU_R, b2hdl,                                &
+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+             C_NULL_PTR /))
+
+  ! reduce each accumulator back onto its owner rank
+  call fstarpu_mpi_redux_data(comm_world, a1hdl)
+  call fstarpu_mpi_redux_data(comm_world, a2hdl)
+  ! write(*,*) "waiting all tasks ..."
+  err = fstarpu_mpi_wait_for_all(comm_world)
+
+  if(comm_w_rank.eq.0) then
+     write(*,*) 'computed result ---> ',a1, "expected =",4.5
+  end if
+  if(comm_w_rank.eq.1) then
+     write(*,*) 'computed result ---> ',a2, "expected=",5.8
+  end if
+  call fstarpu_data_unregister(a1hdl)
+  call fstarpu_data_unregister(a2hdl)
+  call fstarpu_data_unregister(b1hdl)
+  call fstarpu_data_unregister(b2hdl)
+
+  call fstarpu_fxt_stop_profiling()
+  call fstarpu_codelet_free(task_cl)
+  call fstarpu_codelet_free(task_red_cl)
+  call fstarpu_codelet_free(task_ini_cl)
+
+
+  err = fstarpu_mpi_shutdown()
+  call fstarpu_shutdown()
+
+  stop
+
+contains
+
+  ! Work kernel: a = 3.0 + b (replaces the init-seeded copy; sleeps 1s)
+  recursive subroutine cl_cpu_task (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    integer(c_int) :: ret, worker_id
+    integer        :: comm_rank
+    integer, target :: i
+    real(kind(1.d0)), pointer :: a, b
+    real(kind(1.d0))          :: old_a
+
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), b)
+    call nf_sleep(1.d0)
+    old_a = a
+    a = 3.0 + b
+    write(*,*) "task   (c_w_rank:",comm_rank,") from ",old_a,"to",a
+
+    return
+  end subroutine cl_cpu_task
+
+  ! Reduction kernel: fold source copy (buffer 1) into destination (buffer 0)
+  recursive subroutine cl_cpu_task_red (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    integer(c_int) :: ret
+    integer, target                         :: comm_rank
+    real(kind(1.d0)), pointer :: as, ad
+    real(kind(1.d0))           :: old_ad
+
+    comm_rank  = fstarpu_mpi_world_rank()
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), ad)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), as)
+    old_ad = ad
+    ad = ad + as
+    call nf_sleep(1.d0)
+    write(*,*) "red_cl (c_w_rank:",comm_rank,")",as, old_ad, ' ---> ',ad
+
+    return
+  end subroutine cl_cpu_task_red
+
+  ! Init kernel: neutral element (0.0) for the additive reduction
+  recursive subroutine cl_cpu_task_ini (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args
+        ! cl_args is unused
+    integer(c_int) :: ret
+    integer, target                         :: comm_rank
+    real(kind(1.d0)), pointer :: a
+
+    comm_rank  = fstarpu_mpi_world_rank()
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call nf_sleep(0.5d0)
+    a = 0.0
+    write(*,*) "ini_cl (c_w_rank:",comm_rank,")"
+    return
+  end subroutine cl_cpu_task_ini
+
+  ! Busy-wait for t seconds using the system clock (portable "sleep")
+  subroutine nf_sleep(t)
+    implicit none
+    integer :: t_start, t_end, t_rate
+    real(kind(1.d0))     :: ta, t
+    call system_clock(t_start)
+    do
+       call system_clock(t_end, t_rate)
+       ta = real(t_end-t_start)/real(t_rate)
+       if(ta.gt.t) return
+    end do
+  end subroutine nf_sleep
+
+end program main

+ 9 - 0
mpi/include/starpu_mpi.h

@@ -232,6 +232,11 @@ int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, s
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
 
 /**
+   Same as starpu_mpi_irecv_detached(), but with an additional \p prio parameter.
+*/
+int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);
+
+/**
    Post a nonblocking receive in \p data_handle from the node \p
    source using the message tag \p data_tag within the communicator \p
    comm. On completion, the \p callback function is called with the
@@ -561,6 +566,10 @@ int starpu_mpi_data_get_rank(starpu_data_handle_t handle);
    Return the tag of the given data.
 */
 starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t handle);
+/**
+   Return the redux map of the given data.
+*/
+char* starpu_mpi_data_get_redux_map(starpu_data_handle_t handle);
 
 /**
    Symbol kept for backward compatibility. Call function starpu_mpi_data_get_tag()

+ 0 - 1
mpi/src/mpi/starpu_mpi_early_data.h

@@ -40,7 +40,6 @@ LIST_TYPE(_starpu_mpi_early_data_handle,
 	  void *buffer;
 	  size_t size;
 	  unsigned buffer_node;
-	  int req_ready;
 	  struct _starpu_mpi_node_tag node_tag;
 	  starpu_pthread_mutex_t req_mutex;
 	  starpu_pthread_cond_t req_cond;

+ 40 - 34
mpi/src/mpi/starpu_mpi_mpi.c

@@ -50,6 +50,9 @@ static unsigned nready_process;
 /* Number of send requests to submit to MPI at the same time */
 static unsigned ndetached_send;
 
+/* Force allocation of early data */
+static int early_data_force_allocate;
+
 #ifdef STARPU_USE_FXT
 static void _starpu_mpi_add_sync_point_in_fxt(void);
 #endif
@@ -81,6 +84,11 @@ static starpu_pthread_t progress_thread;
 #endif
 static int running = 0;
 
+/* Provides synchronization between an early request, a sync request, and an early data handle:
+ * we keep it held while checking and posting one to prevent the other.
+ * This is to be taken always before the progress_mutex. */
+static starpu_pthread_mutex_t early_data_mutex;
+
 /* Driver taken by StarPU-MPI to process tasks when there is no requests to
  * handle instead of polling endlessly */
 static struct starpu_driver *mpi_driver = NULL;
@@ -103,7 +111,7 @@ static int posted_requests = 0, ready_requests = 0, newer_requests, mpi_wait_for
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 #define _STARPU_MPI_INC_READY_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_ready_requests); ready_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_ready_requests); }
 
-extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
+extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count, int prio);
 
 #ifdef STARPU_SIMGRID
 #pragma weak smpi_simulated_main_
@@ -182,8 +190,6 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
 	_STARPU_MPI_DEBUG(0, "new req %p srcdst %d tag %"PRIi64" and type %s %d\n", req, req->node_tag.node.rank, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->backend->is_internal_req);
 
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
-
 	if (req->request_type == RECV_REQ)
 	{
 		/* Case : the request is the internal receive request submitted
@@ -206,6 +212,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 			}
 
+			STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 					  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr,
 					  req->datatype_name, (int)req->count, req->registered_datatype);
@@ -213,31 +220,24 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			_STARPU_MPI_INC_READY_REQUESTS(+1);
 
 			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
-			STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-			STARPU_PTHREAD_MUTEX_LOCK(&req->backend->posted_mutex);
 			req->posted = 1;
 			STARPU_PTHREAD_COND_BROADCAST(&req->backend->posted_cond);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->posted_mutex);
-			STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		}
 		else
 		{
+			STARPU_PTHREAD_MUTEX_LOCK(&early_data_mutex);
 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
 
 			if (early_data_handle)
 			{
+				/* Got the early_data_handle */
+				STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
+
 				/* Case: a receive request for a data with the given tag and source has already been
 				 * posted to MPI by StarPU. Asynchronously requests a Read permission over the temporary handle ,
 				 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
 				 * will be called to bring the data back to the original data handle associated to the request.*/
-				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
-				while (!(early_data_handle->req_ready))
-					STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req_cond), &(early_data_handle->req_mutex));
-				STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req_mutex));
-				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
-
 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %"PRIi64" has already been received, copying previously received data into handle's pointer..\n", req, req->node_tag.data_tag);
 				STARPU_ASSERT(req->data_handle != early_data_handle->handle);
 
@@ -254,9 +254,8 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				cb_args->req = req;
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
-				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 				// FIXME: when buffer == NULL, do not hardcode acquiring on early_data_handle->buffer_node, to just acquire where the data happens to have been stored by MPI
-				starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(early_data_handle->handle,early_data_handle->buffer_node,STARPU_R,NULL,_starpu_mpi_early_data_cb,(void*) cb_args,  1, 0, NULL, NULL);
+				starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(early_data_handle->handle,early_data_handle->buffer_node,STARPU_R,NULL,_starpu_mpi_early_data_cb,(void*) cb_args,  1, 0, NULL, NULL, req->prio);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			}
 			else
@@ -265,6 +264,8 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
 				if (sync_req)
 				{
+					/* Got the sync req */
+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 					/* Case: we already received the send envelope, we can proceed with the receive */
 					req->sync = 1;
 					_starpu_mpi_datatype_allocate(req->data_handle, req);
@@ -279,6 +280,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 						STARPU_ASSERT(req->count);
 						req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 					}
+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_STARPU_MPI_INC_READY_REQUESTS(+1);
 					/* Throw away the dumb request that was only used to know that we got the envelope */
@@ -288,13 +290,17 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				{
 					/* Case: no matching data has been received. Store the receive request as an early_request. */
 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 					_starpu_mpi_early_request_enqueue(req);
+					/* We have queued our early request, we can let the progression thread look at it */
+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 				}
 			}
 		}
 	}
 	else
 	{
+		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		if (req->request_type == SEND_REQ)
 			_starpu_mpi_req_prio_list_push_front(&ready_send_requests, req);
 		else
@@ -1157,13 +1163,11 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 	_starpu_mpi_early_data_add(early_data_handle);
 
 	starpu_data_handle_t data_handle;
-	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	data_handle = _starpu_mpi_tag_get_data_handle_from_tag(envelope->data_tag);
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 
 	// TODO: rather select some memory node next to the NIC
 	unsigned buffer_node = STARPU_MAIN_RAM;
-	if (data_handle && starpu_data_get_interface_id(data_handle) < STARPU_MAX_INTERFACE_ID)
+	if (data_handle && starpu_data_get_interface_id(data_handle) < STARPU_MAX_INTERFACE_ID && !early_data_force_allocate)
 	{
 		/* We know which data will receive it and we won't have to unpack, use just the same kind of data.  */
 		early_data_handle->buffer = NULL;
@@ -1190,25 +1194,16 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	early_data_handle->req = _starpu_mpi_irecv_common(early_data_handle->handle, status.MPI_SOURCE,
 							  early_data_handle->node_tag.data_tag, comm, 1, 0,
-							  NULL, NULL, 1, 1, envelope->size);
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+							  NULL, NULL, 1, 1, envelope->size, STARPU_DEFAULT_PRIO);
+	/* The early data handle is ready, we can let _starpu_mpi_submit_ready_request
+	 * proceed with acquiring it */
+	STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 
+	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	// We wait until the request is pushed in the
 	// ready_request list
-	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-	STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->backend->posted_mutex));
 	while (!(early_data_handle->req->posted))
-		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->backend->posted_cond), &(early_data_handle->req->backend->posted_mutex));
-	STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->backend->posted_mutex));
-
-#ifdef STARPU_DEVEL
-#warning check if req_ready is still necessary
-#endif
-	STARPU_PTHREAD_MUTEX_LOCK(&early_data_handle->req_mutex);
-	early_data_handle->req_ready = 1;
-	STARPU_PTHREAD_COND_BROADCAST(&early_data_handle->req_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->backend->posted_cond), &progress_mutex);
 
 	// Handle the request immediatly to make sure the mpi_irecv is
 	// posted before receiving an other envelope
@@ -1421,6 +1416,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 				{
 					_STARPU_MPI_DEBUG(3, "Searching for application request with tag %"PRIi64" and source %d (size %ld)\n", envelope->data_tag, envelope_status.MPI_SOURCE, envelope->size);
 
+					STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&early_data_mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 					struct _starpu_mpi_req *early_request = _starpu_mpi_early_request_dequeue(envelope->data_tag, envelope_status.MPI_SOURCE, envelope_comm);
 
 					/* Case: a data will arrive before a matching receive is
@@ -1453,9 +1451,12 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 							new_req->backend->is_internal_req = 0; // ????
 							new_req->count = envelope->size;
 							_starpu_mpi_sync_data_add(new_req);
+							/* We have queued our sync request, we can let _starpu_mpi_submit_ready_request find it */
+							STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 						}
 						else
 						{
+							/* This will release early_data_mutex when appropriate */
 							_starpu_mpi_receive_early_data(envelope, envelope_status, envelope_comm);
 						}
 					}
@@ -1466,6 +1467,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					 * _starpu_mpi_handle_ready_request. */
 					else
 					{
+						/* Got the early request */
+						STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 						_STARPU_MPI_DEBUG(2000, "A matching application request has been found for the incoming data with tag %"PRIi64"\n", envelope->data_tag);
 						_STARPU_MPI_DEBUG(2000, "Request sync %d\n", envelope->sync);
 
@@ -1621,6 +1624,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 {
         STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
+        STARPU_PTHREAD_MUTEX_INIT(&early_data_mutex, NULL);
         STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
         STARPU_PTHREAD_COND_INIT(&barrier_cond, NULL);
 	_starpu_mpi_req_list_init(&ready_recv_requests);
@@ -1634,6 +1638,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
 	nready_process = starpu_get_env_number_default("STARPU_MPI_NREADY_PROCESS", 10);
 	ndetached_send = starpu_get_env_number_default("STARPU_MPI_NDETACHED_SEND", 10);
+	early_data_force_allocate = starpu_get_env_number_default("STARPU_MPI_EARLYDATA_ALLOCATE", 0);
 
 #ifdef STARPU_SIMGRID
 	STARPU_PTHREAD_MUTEX_INIT(&wait_counter_mutex, NULL);
@@ -1688,6 +1693,7 @@ void _starpu_mpi_progress_shutdown(void **value)
         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_posted_requests);
         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_ready_requests);
         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
+        STARPU_PTHREAD_MUTEX_DESTROY(&early_data_mutex);
         STARPU_PTHREAD_COND_DESTROY(&barrier_cond);
 }
 

+ 0 - 2
mpi/src/mpi/starpu_mpi_mpi_backend.c

@@ -54,7 +54,6 @@ void _starpu_mpi_mpi_backend_request_init(struct _starpu_mpi_req *req)
 
 	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->req_mutex, NULL);
 	STARPU_PTHREAD_COND_INIT0(&req->backend->req_cond, NULL);
-	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->posted_mutex, NULL);
 	STARPU_PTHREAD_COND_INIT0(&req->backend->posted_cond, NULL);
 
 	//req->backend->other_request = NULL;
@@ -80,7 +79,6 @@ void _starpu_mpi_mpi_backend_request_destroy(struct _starpu_mpi_req *req)
 {
 	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->req_mutex);
 	STARPU_PTHREAD_COND_DESTROY(&req->backend->req_cond);
-	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->posted_mutex);
 	STARPU_PTHREAD_COND_DESTROY(&req->backend->posted_cond);
 	free(req->backend);
 	req->backend = NULL;

+ 0 - 1
mpi/src/mpi/starpu_mpi_mpi_backend.h

@@ -54,7 +54,6 @@ struct _starpu_mpi_req_backend
 
 	starpu_pthread_mutex_t req_mutex;
 	starpu_pthread_cond_t req_cond;
-	starpu_pthread_mutex_t posted_mutex;
 	starpu_pthread_cond_t posted_cond;
 	/** In the case of a Wait/Test request, we are going to post a request
 	 * to test the completion of another request */

+ 28 - 9
mpi/src/starpu_mpi.c

@@ -161,12 +161,12 @@ static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum sta
 
 	if (sequential_consistency)
 	{
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid, req->prio);
 	}
 	else
 	{
 		/* post_sync_job_id has already been filled */
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL, req->prio);
 	}
 }
 
@@ -289,7 +289,7 @@ int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, starp
 	return starpu_mpi_issend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
 }
 
-struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count)
+struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count, int prio)
 {
 	if (_starpu_mpi_fake_world_size != -1)
 	{
@@ -297,7 +297,7 @@ struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handl
 		return NULL;
 	}
 
-	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, 0, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
+	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, prio, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
 	_starpu_mpi_req_willpost(req);
 
 	if (sequential_consistency == 0)
@@ -317,7 +317,7 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 
 	struct _starpu_mpi_req *req;
 	_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(source, data_tag);
-	req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0);
+	req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0, STARPU_DEFAULT_PRIO);
 	_STARPU_MPI_TRACE_IRECV_COMPLETE_END(source, data_tag);
 
 	STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
@@ -331,7 +331,17 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, star
 {
 	_STARPU_MPI_LOG_IN();
 
-	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0);
+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, STARPU_DEFAULT_PRIO);
+	_STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
+{
+	_STARPU_MPI_LOG_IN();
+
+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, prio);
+
 	_STARPU_MPI_LOG_OUT();
 	return 0;
 }
@@ -340,7 +350,7 @@ int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_h
 {
 	_STARPU_MPI_LOG_IN();
 
-	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0);
+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0, STARPU_DEFAULT_PRIO);
 
 	_STARPU_MPI_LOG_OUT();
 	return 0;
@@ -379,10 +389,13 @@ int starpu_mpi_barrier(MPI_Comm comm)
 
 void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
 {
+	struct _starpu_mpi_data *data = data_handle->mpi_data;
 	_mpi_backend._starpu_mpi_backend_data_clear(data_handle);
 	_starpu_mpi_cache_data_clear(data_handle);
-	_starpu_spin_destroy(&((struct _starpu_mpi_data*) data_handle->mpi_data)->coop_lock);
-	free(data_handle->mpi_data);
+	_starpu_spin_destroy(&data->coop_lock);
+	if (data->redux_map != REDUX_CONTRIB)
+		free(data->redux_map);
+	free(data);
 	data_handle->mpi_data = NULL;
 }
 
@@ -448,6 +461,12 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
 	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
 }
 
+char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data)
+{
+	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
+	return ((struct _starpu_mpi_data *)(data->mpi_data))->redux_map;
+}
+
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
 {
 	int me, rank;

+ 1 - 2
mpi/src/starpu_mpi_coop_sends.c

@@ -297,8 +297,7 @@ void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_
 
 	if (first)
 		/* We were first, we are responsible for acquiring the data for everybody */
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, -1, mode, _starpu_mpi_coop_send_acquired_callback, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &coop_sends->pre_sync_jobid, NULL);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, -1, mode, _starpu_mpi_coop_send_acquired_callback, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &coop_sends->pre_sync_jobid, NULL, req->prio);
 	else
 		req->pre_sync_jobid = coop_sends->pre_sync_jobid;
 }
-

+ 16 - 6
mpi/src/starpu_mpi_private.h

@@ -118,7 +118,7 @@ int _starpu_debug_rank;
 			fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%ld:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
 			fflush(stderr);	\
 		} \
-	} while(0);
+	} while(0)
 #  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
 #  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm)  _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
 #  define _STARPU_MPI_DEBUG(level, fmt, ...) \
@@ -130,7 +130,7 @@ int _starpu_debug_rank;
 			fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__,## __VA_ARGS__); \
 			fflush(stderr); \
 		} \
-	} while(0);
+	} while(0)
 #else
 #  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way)  do { } while(0)
 #  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm)     do { } while(0)
@@ -141,10 +141,10 @@ int _starpu_debug_rank;
 #define _STARPU_MPI_DISP(fmt, ...) do { if (!_starpu_silent) { \
 	       				     if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
                                              fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
-                                             fflush(stderr); }} while(0);
+                                             fflush(stderr); }} while(0)
 #define _STARPU_MPI_MSG(fmt, ...) do { if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
                                              fprintf(stderr, "[%d][starpu_mpi][%s:%d] " fmt , _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
-                                             fflush(stderr); } while(0);
+                                             fflush(stderr); } while(0)
 
 #ifdef STARPU_MPI_EXTRA_VERBOSE
 #  define _STARPU_MPI_LOG_IN()             do { if (!_starpu_silent) { \
@@ -203,6 +203,12 @@ struct _starpu_mpi_coop_sends
 	long pre_sync_jobid;
 };
 
+/** cf. redux_map field : this is the value
+ * put in this field whenever a node contributes
+ * to the reduction of the data.
+ * Only the owning node keeps track of all the contributing nodes. */
+#define REDUX_CONTRIB ((char*) -1)
+
 /** Initialized in starpu_mpi_data_register_comm */
 struct _starpu_mpi_data
 {
@@ -211,8 +217,12 @@ struct _starpu_mpi_data
 	char *cache_sent;
 	int cache_received;
 
-	/** Rendez-vous data for opportunistic cooperative sends */
-	/** Needed to synchronize between submit thread and workers */
+	/** Array used to store the contributing nodes to this data
+	  * when it is accessed in REDUX mode. */
+	char* redux_map;
+
+	/** Rendez-vous data for opportunistic cooperative sends,
+	  * Needed to synchronize between submit thread and workers */
 	struct _starpu_spinlock coop_lock;
 	/** Current cooperative send bag */
 	struct _starpu_mpi_coop_sends *coop_sends;

+ 55 - 20
mpi/src/starpu_mpi_task_insert.c

@@ -100,7 +100,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 	{
 		STARPU_ASSERT_MSG(starpu_mpi_data_get_rank(data) == STARPU_MPI_PER_NODE, "If task is replicated, it has to access only per-node data");
 	}
-	if (data && mode & STARPU_R)
+	if (data && mode & STARPU_R && !(mode & STARPU_MPI_REDUX))
 	{
 		int mpi_rank = starpu_mpi_data_get_rank(data);
 		starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(data);
@@ -118,7 +118,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 				if (data_tag == -1)
 					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 				_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data, mpi_rank);
-				starpu_mpi_irecv_detached(data, mpi_rank, data_tag, comm, NULL, NULL);
+				starpu_mpi_irecv_detached_prio(data, mpi_rank, data_tag, prio, comm, NULL, NULL);
 			}
 			// else the node has already received the data
 		}
@@ -142,7 +142,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 static
 void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm)
 {
-	if (mode & STARPU_W)
+	if (mode & STARPU_W && !(mode & STARPU_MPI_REDUX))
 	{
 		int mpi_rank = starpu_mpi_data_get_rank(data);
 		starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(data);
@@ -179,7 +179,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 {
 	if (_starpu_cache_enabled)
 	{
-		if (mode & STARPU_W || mode & STARPU_REDUX)
+		if ((mode & STARPU_W && !(mode & STARPU_MPI_REDUX)) || mode & STARPU_REDUX)
 		{
 			/* The data has been modified, it MUST be removed from the cache */
 			starpu_mpi_cached_send_clear(data);
@@ -189,7 +189,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 	else
 	{
 		/* We allocated a temporary buffer for the received data, now drop it */
-		if ((mode & STARPU_R) && do_execute)
+		if ((mode & STARPU_R && !(mode & STARPU_MPI_REDUX)) && do_execute)
 		{
 			int mpi_rank = starpu_mpi_data_get_rank(data);
 			if (mpi_rank == STARPU_MPI_PER_NODE)
@@ -254,7 +254,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 				inconsistent_execute = 0;
 			}
 		}
-		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX || arg_type & STARPU_MPI_REDUX)
 		{
 			starpu_data_handle_t data = va_arg(varg_list_copy, starpu_data_handle_t);
 			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
@@ -617,6 +617,20 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 
 	for(i=0 ; i<nb_data ; i++)
 	{
+		if ((descrs[i].mode & STARPU_REDUX || descrs[i].mode & STARPU_MPI_REDUX) && descrs[i].handle)
+		{
+			struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data;
+			if (me == starpu_mpi_data_get_rank(descrs[i].handle))
+			{
+				int size;
+				starpu_mpi_comm_size(comm, &size);
+				if (mpi_data->redux_map == NULL)
+					_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
+				mpi_data->redux_map [xrank] = 1;
+			}
+			else if (me == xrank)
+				mpi_data->redux_map = REDUX_CONTRIB;
+		}
 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
 	}
@@ -813,6 +827,11 @@ void _starpu_mpi_redux_fill_post_sync_jobid(const void * const redux_data_args,
 
 /* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
  * a data previously accessed in REDUX mode gets accessed in R mode. */
+/* FIXME: In order to prevent simultaneous receive submissions
+ * on the same handle, we need to wait that all the starpu_mpi
+ * tasks are done before submitting next tasks. The current
+ * version of the implementation does not support multiple
+ * simultaneous receive requests on the same handle.*/
 void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
 {
 	int me, rank, nb_nodes;
@@ -820,6 +839,7 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 
 	rank = starpu_mpi_data_get_rank(data_handle);
 	data_tag = starpu_mpi_data_get_tag(data_handle);
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 	if (rank == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
@@ -832,12 +852,16 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 	starpu_mpi_comm_rank(comm, &me);
 	starpu_mpi_comm_size(comm, &nb_nodes);
 
-	_STARPU_MPI_DEBUG(1, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
-
+	_STARPU_MPI_DEBUG(50, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
 	// need to count how many nodes have the data in redux mode
 	if (me == rank)
 	{
-		int i;
+		int i,j;
+		_STARPU_MPI_DEBUG(50, "Who is in the map ?\n");
+		for (j = 0; j<nb_nodes; j++)
+		{
+			_STARPU_MPI_DEBUG(50, "%d is in the map ? %d\n", j, mpi_data->redux_map[j]);
+		}
 
 		// taskC depends on all taskBs created
 		// Creating synchronization task and use its jobid for tracing
@@ -848,8 +872,9 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 
 		for(i=0 ; i<nb_nodes ; i++)
 		{
-			if (i != rank)
+			if (i != rank && mpi_data->redux_map[i])
 			{
+				_STARPU_MPI_DEBUG(5, "%d takes part in the reduction of %p \n", i, data_handle);
 				/* We need to make sure all is
 				 * executed after data_handle finished
 				 * its last read access, we hence do
@@ -893,24 +918,34 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 						   STARPU_CALLBACK_WITH_ARG_NFREE, _starpu_mpi_redux_data_recv_callback, args,
 						   0);
 			}
+			else
+			{
+				_STARPU_MPI_DEBUG(5, "%d is not in the map or is me\n", i);
+			}
 		}
 
 		int ret = starpu_task_submit(taskC);
 		STARPU_ASSERT(ret == 0);
 	}
-	else
+	else if (mpi_data->redux_map)
 	{
-		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
+		STARPU_ASSERT(mpi_data->redux_map == REDUX_CONTRIB);
+		_STARPU_MPI_DEBUG(5, "Sending redux handle to %d ...\n", rank);
 		starpu_mpi_isend_detached_prio(data_handle, rank, data_tag, prio, comm, NULL, NULL);
-		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
+		starpu_data_invalidate_submit(data_handle);
 	}
-	/* FIXME: In order to prevent simultaneous receive submissions
-	 * on the same handle, we need to wait that all the starpu_mpi
-	 * tasks are done before submitting next tasks. The current
-	 * version of the implementation does not support multiple
-	 * simultaneous receive requests on the same handle.*/
-	starpu_task_wait_for_all();
-
+	else
+	{
+		_STARPU_MPI_DEBUG(5, "I am not in the map of %d, I am %d ...\n", rank, me);
+	}
+	if (mpi_data->redux_map != NULL)
+	{
+		_STARPU_MPI_DEBUG(100, "waiting for redux tasks with %d\n", rank);
+		starpu_task_wait_for_all();
+	}
+	if (me == rank)
+		free(mpi_data->redux_map);
+	mpi_data->redux_map = NULL;
 }
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 {

+ 1 - 1
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -74,7 +74,7 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 				inconsistent_execute = 0;
 			}
 		}
-		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX || arg_type & STARPU_MPI_REDUX)
 		{
 			arg_i++;
 			starpu_data_handle_t data = arglist[arg_i];

+ 1 - 1
mpi/tests/mpi_reduction.c

@@ -37,7 +37,7 @@ static struct starpu_codelet init_codelet =
 static struct starpu_codelet redux_codelet =
 {
 	.cpu_funcs = {redux_cpu_func},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 #ifdef STARPU_SIMGRID
 	.model = &starpu_perfmodel_nop,

+ 3 - 0
mpi/tests/mpi_redux.c

@@ -14,6 +14,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+/* This test does a manual reduction: all ranks send a number to the rank 0,
+ * the rank 0 sums these numbers and sends back the result to all ranks. */
+
 #include <starpu_mpi.h>
 #include "helper.h"
 

+ 20 - 20
src/common/fxt.h

@@ -342,7 +342,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE1STR
@@ -356,7 +356,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE1STR(CODE, P1, str);		\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE2STR
@@ -377,7 +377,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE2STR
@@ -388,7 +388,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE2STR(CODE, P1, P2, str);		\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE3STR
@@ -410,7 +410,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE3STR
@@ -421,7 +421,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE3STR(CODE, P1, P2, P3, str);	\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE4STR
@@ -444,7 +444,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE4STR
@@ -455,7 +455,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE4STR(CODE, P1, P2, P3, P4, str);	\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE5STR
@@ -479,7 +479,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE5STR
@@ -490,7 +490,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE5STR(CODE, P1, P2, P3, P4, P5, str);	\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE6STR
@@ -515,7 +515,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE6STR
@@ -526,7 +526,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str);	\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_ALWAYS_PROBE7STR
@@ -552,7 +552,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 
 #ifdef FUT_FULL_PROBE7STR
@@ -563,7 +563,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str);	\
     }									\
-} while (0);
+} while (0)
 #endif
 
 #ifndef FUT_RAW_PROBE7
@@ -787,7 +787,7 @@ do {									\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
 		FUT_FULL_PROBE7(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
 	}								\
-} while(0);
+} while(0)
 
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)			\
 do {									\
@@ -796,7 +796,7 @@ do {									\
 	char _archname[32]=""; \
 	starpu_perfmodel_get_arch_name(perf_arch, _archname, 32, 0);	\
 	_STARPU_FUT_FULL_PROBE5STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_END_CODELET_BODY, (job)->job_id, (job_size), (job_hash), workerid, _starpu_gettid(), _archname); \
-} while(0);
+} while(0)
 
 #define _STARPU_TRACE_START_EXECUTING()				\
 	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_START_EXECUTING, _starpu_gettid());
@@ -898,7 +898,7 @@ do {										\
 	else {									\
 		FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 0);\
 	}									\
-} while(0);
+} while(0)
 
 #define _STARPU_TRACE_DATA_NAME(handle, name) \
 	_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_META, _STARPU_FUT_DATA_NAME, handle, name)
@@ -1319,8 +1319,8 @@ do {										\
 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)          \
        FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_STATE_SHARED, handle, node)
 
-#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre)          \
-       FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_REQUEST_CREATED, orig, dest, prio, handle, is_pre)
+#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre, req)          \
+       FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_REQUEST_CREATED, orig, dest, prio, handle, is_pre, req)
 
 
 #else // !STARPU_USE_FXT
@@ -1451,7 +1451,7 @@ do {										\
 #define _STARPU_TRACE_DATA_STATE_INVALID(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_STATE_OWNER(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)	do {(void)(handle); (void)(node);} while(0)
-#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre) do {(void)(handle); (void)(orig); (void)(dest); (void)(prio); (void)(is_pre);} while(0)
+#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre, req) do {(void)(handle); (void)(orig); (void)(dest); (void)(prio); (void)(is_pre); (void)(req); } while(0)
 #define _STARPU_TRACE_PAPI_TASK_EVENT(event_id, task, value) do {(void)(event_id); (void)(task); (void)(value);} while(0)
 
 #endif // STARPU_USE_FXT

+ 5 - 0
src/common/hash.c

@@ -46,6 +46,11 @@ uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc)
 	return crc;
 }
 
+uint32_t starpu_hash_crc32c_be_ptr(void *input, uint32_t inputcrc)
+{
+	return starpu_hash_crc32c_be_n(&input, sizeof(input), inputcrc);
+}
+
 uint32_t starpu_hash_crc32c_be(uint32_t input, uint32_t inputcrc)
 {
 	uint8_t *p = (uint8_t *)&input;

+ 4 - 4
src/common/uthash.h

@@ -104,12 +104,12 @@ do {
   if (!((tbl)->bloom_bv))  { uthash_fatal( "out of memory"); }                   \
   memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN);                                \
   (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE;                                       \
-} while (0);
+} while (0)
 
 #define HASH_BLOOM_FREE(tbl)                                                     \
 do {                                                                             \
   uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN);                              \
-} while (0);
+} while (0)
 
 #define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8)))
 #define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8)))
@@ -368,7 +368,7 @@ do {
   for(_fn_i=0; _fn_i < keylen; _fn_i++)                                          \
       hashv = (hashv * 16777619) ^ _hf_key[_fn_i];                               \
   bkt = hashv & (num_bkts-1);                                                    \
-} while(0);
+} while(0)
  
 #define HASH_OAT(key,keylen,num_bkts,hashv,bkt)                                  \
 do {                                                                             \
@@ -507,7 +507,7 @@ do {
     hashv ^= hashv << 25;                                                        \
     hashv += hashv >> 6;                                                         \
     bkt = hashv & (num_bkts-1);                                                  \
-} while(0);
+} while(0)
 
 #ifdef HASH_USING_NO_STRICT_ALIASING
 /* The MurmurHash exploits some CPU's (e.g. x86) tolerance for unaligned reads.

+ 1 - 1
src/core/dependencies/data_arbiter_concurrency.c

@@ -286,7 +286,7 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);

+ 2 - 2
src/core/dependencies/data_concurrency.c

@@ -132,7 +132,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);
@@ -266,7 +266,7 @@ static void _starpu_take_data(unsigned request_from_codelet,
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);

+ 6 - 2
src/core/dependencies/implicit_data_deps.c

@@ -225,8 +225,12 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
 		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
 
-		if (mode & STARPU_R)
-			STARPU_ASSERT_MSG(handle->initialized || handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
+		if (mode & STARPU_R && !handle->initialized)
+		{
+			STARPU_ASSERT_MSG(handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
+			/* The task will initialize it with init_cl */
+			handle->initialized = 1;
+		}
 
 		if (mode & STARPU_W || mode == STARPU_REDUX)
 		{

+ 1 - 2
src/core/jobs.c

@@ -288,8 +288,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	{
 		unsigned long jobs = STARPU_ATOMIC_ADDL(&njobs_finished, 1);
 
-		printf("\r%lu tasks finished...", jobs);
-		fflush(stdout);
+		fprintf(stderr,"\r%lu tasks finished (last %lu %p)...", jobs, j->job_id, j->task);
 	}
 
 	struct starpu_task *task = j->task;

+ 18 - 11
src/core/perfmodel/energy_model.c

@@ -43,7 +43,7 @@
 #endif
 #endif
 
-#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  return(retval); } while (0)
+#define ERROR_RETURN(retval, function) do { PAPI_perror(function); fprintf(stderr, "Error %d %s:line %d\n", retval,__FILE__,__LINE__);  return(retval); } while (0)
 
 #if 0
 #define debug(fmt, ...) printf(fmt, ## __VA_ARGS__)
@@ -52,6 +52,7 @@
 #endif
 
 #ifdef STARPU_PAPI
+#ifdef STARPU_HAVE_HWLOC
 static const int N_EVTS = 2;
 
 static int nsockets;
@@ -68,7 +69,7 @@ static int add_event(int EventSet, int socket);
 
 /*must be initialized to PAPI_NULL before calling PAPI_create_event*/
 static int EventSet = PAPI_NULL;
-
+#endif
 #endif
 
 static double t1;
@@ -80,7 +81,7 @@ static nvmlDevice_t device;
 #endif
 #endif
 
-int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
+int starpu_energy_start(int workerid STARPU_ATTRIBUTE_UNUSED, enum starpu_worker_archtype archi)
 {
 	t1 = starpu_timing_now();
 
@@ -100,11 +101,11 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
 		nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
 
 		if ((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_library_init");
 
 		/* Creating the eventset */
 		if ((retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_create_eventset");
 
 		int i;
 		for (i = 0 ; i < nsockets ; i ++ )
@@ -112,19 +113,25 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
 			/* return the index of socket */
 			hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PACKAGE, i);
 			if ( (retval = add_event(EventSet, obj->os_index)) != PAPI_OK)
-				ERROR_RETURN(retval);
+			{
+				if (retval == PAPI_EPERM)
+					_STARPU_DISP("PAPI could not access counters due to permissions errors. Perhaps your system requires to run measurements as root?\n");
+				else if (retval == PAPI_ENOEVNT)
+					_STARPU_DISP("PAPI could not access counters. Perhaps your system requires to run measurements as root?\n");
+				ERROR_RETURN(retval, "PAPI_add_named_event");
+			}
 		}
 
 		/* get the number of events in the event set */
 		number = 0;
 		if ( (retval = PAPI_list_events(EventSet, NULL, &number)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_list_events");
 
 		debug("There are %d events in the event set\n", number);
 
 		/* Start counting */
 		if ( (retval = PAPI_start(EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_start");
 
 		return retval;
 	}
@@ -180,7 +187,7 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
 		/* Stop counting and store the values into the array */
 		if ( (retval = PAPI_stop(EventSet, values)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_stop");
 
 		int k,s;
 
@@ -199,11 +206,11 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
 		/*removes all events from a PAPI event set */
 		if ( (retval = PAPI_cleanup_eventset(EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_cleanup_eventset");
 
 		/*deallocates the memory associated with an empty PAPI EventSet*/
 		if ( (retval = PAPI_destroy_eventset(&EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_destroy_eventset");
 
 		break;
 	}

+ 8 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -1328,7 +1328,7 @@ static void write_bus_latency_file_content(void)
 
 	_STARPU_DEBUG("writing latencies to %s\n", path);
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	if (!f)
 	{
 		perror("fopen write_bus_latency_file_content");
@@ -1337,6 +1337,7 @@ static void write_bus_latency_file_content(void)
 		STARPU_ABORT();
 	}
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 
 	fprintf(f, "# ");
@@ -1684,10 +1685,11 @@ static void write_bus_bandwidth_file_content(void)
 
 	_STARPU_DEBUG("writing bandwidth to %s\n", path);
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
 
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 
 	fprintf(f, "# ");
@@ -2124,9 +2126,10 @@ static void write_bus_config_file_content(void)
 
 	_STARPU_DEBUG("writing config to %s\n", path);
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 
 	fprintf(f, "# Current configuration\n");
@@ -2655,7 +2658,7 @@ static void write_bus_platform_file_content(int version)
 
 	_STARPU_DEBUG("writing platform to %s\n", path);
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	if (!f)
 	{
 		perror("fopen write_bus_platform_file_content");
@@ -2664,6 +2667,7 @@ static void write_bus_platform_file_content(int version)
 		STARPU_ABORT();
 	}
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 
 	fprintf(f,

+ 4 - 3
src/core/perfmodel/perfmodel_history.c

@@ -1177,11 +1177,12 @@ void starpu_save_history_based_model(struct starpu_perfmodel *model)
 
 	/* overwrite existing file, or create it */
 	FILE *f;
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	STARPU_ASSERT_MSG(f, "Could not save performance model %s\n", path);
 
 	locked = _starpu_fwrlock(f) == 0;
 	check_model(model);
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 	dump_model_file(f, model);
 	if (locked)
@@ -1610,10 +1611,10 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 	}
 
 	regmodel = &model->state->per_arch[comb][nimpl].regression;
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 
 	if (regmodel->valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 
 docal:
 	STARPU_HG_DISABLE_CHECKING(model->benchmarking);
@@ -1654,8 +1655,8 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
 	if (regmodel->nl_valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
 	{
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
+		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 	}
 	else
 	{

+ 1 - 20
src/core/sched_policy.c

@@ -206,7 +206,7 @@ struct starpu_sched_policy *_starpu_select_sched_policy(struct _starpu_machine_c
 	if (selected_policy)
 		return selected_policy;
 
-	/* If no policy was specified, we use the eager policy by default */
+	/* If no policy was specified, we use the lws policy by default */
 	return &_starpu_sched_lws_policy;
 }
 
@@ -1153,25 +1153,6 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 	}
 }
 
-void _starpu_wait_on_sched_event(void)
-{
-	struct _starpu_worker *worker = _starpu_get_local_worker_key();
-
-	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
-
-	_starpu_handle_all_pending_node_data_requests(worker->memory_node);
-
-	if (_starpu_machine_is_running())
-	{
-#ifndef STARPU_NON_BLOCKING_DRIVERS
-		STARPU_PTHREAD_COND_WAIT(&worker->sched_cond,
-					  &worker->sched_mutex);
-#endif
-	}
-
-	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
-}
-
 int starpu_push_local_task(int workerid, struct starpu_task *task, int back STARPU_ATTRIBUTE_UNUSED)
 {
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);

+ 0 - 2
src/core/sched_policy.h

@@ -63,8 +63,6 @@ struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx);
 void _starpu_sched_post_exec_hook(struct starpu_task *task);
 int _starpu_pop_task_end(struct starpu_task *task);
 
-void _starpu_wait_on_sched_event(void);
-
 struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
 						   unsigned int node) STARPU_ATTRIBUTE_MALLOC;
 

+ 10 - 0
src/core/workers.c

@@ -1168,6 +1168,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 
 	/* Do not start performance counter collection by default */
 	conf->start_perf_counter_collection = 0;
+
+	conf->cuda_only_fast_alloc_other_memnodes = starpu_get_env_number_default("STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES", 0);
 	return 0;
 }
 
@@ -1531,6 +1533,14 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 		_STARPU_DISP("Warning: STARPU_ENABLE_STATS is enabled, which slows down a bit\n");
 	}
 
+#ifndef STARPU_SIMGRID
+	if (starpu_get_env_number_default("STARPU_SIMGRID", 0))
+	{
+		_STARPU_DISP("Simulation mode requested, but this libstarpu was built without simgrid support, please recompile\n");
+		return -EINVAL;
+	}
+#endif
+
 #if defined(_WIN32) && !defined(__CYGWIN__)
 	WSADATA wsadata;
 	WSAStartup(MAKEWORD(1,0), &wsadata);

+ 78 - 61
src/datawizard/coherency.c

@@ -179,7 +179,6 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 
 	/* the data is present now */
 	unsigned requesting_node = requesting_replicate->memory_node;
-	requesting_replicate->requested &= ~(1UL << requesting_node);
 
 	if (mode & STARPU_W)
 	{
@@ -406,16 +405,18 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * write-only request. */
-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, struct starpu_task *task, enum starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_data_request *r;
 
-	r = replicate->request[node];
-
-	if (r)
+	for (r = replicate->request[node]; r; r = r->next_same_req)
 	{
 		_starpu_spin_checklocked(&r->handle->header_lock);
 
+		if (task && r->task && task != r->task)
+			/* Do not collapse requests for different tasks */
+			continue;
+
 		_starpu_spin_lock(&r->lock);
 
                 /* perhaps we need to "upgrade" the request */
@@ -440,9 +441,12 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 		if (mode & STARPU_W)
 			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int)  STARPU_W);
+
+		/* We collapse with this request */
+		return r;
 	}
 
-	return r;
+	return NULL;
 }
 
 
@@ -469,7 +473,9 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch,
+								  enum starpu_data_access_mode mode,
+								  struct starpu_task *task,
+								  enum starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
@@ -493,8 +499,11 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		unsigned nnodes = starpu_memory_nodes_get_count();
 		for (i = 0; i < nnodes; i++)
 			for (j = 0; j < nnodes; j++)
-				if (handle->per_node[i].request[j])
+			{
+				struct _starpu_data_request *r;
+				for (r = handle->per_node[i].request[j]; r; r = r->next_same_req)
 					nwait++;
+			}
 		/* If the request is not detached (i.e. the caller really wants
 		 * proper ownership), no new requests will appear because a
 		 * reference will be kept on the dst replicate, which will
@@ -531,6 +540,25 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
 			}
+
+			if (task)
+			{
+				unsigned j;
+				unsigned nnodes = starpu_memory_nodes_get_count();
+				/* Cancel any existing (prefetch) request */
+				struct _starpu_data_request *r2;
+				for (j = 0; j < nnodes; j++)
+				{
+					for (r2 = dst_replicate->request[j]; r2; r2 = r2->next_same_req)
+					{
+						if (r2->task && r2->task == task)
+						{
+							r2->canceled = 1;
+							break;
+						}
+					}
+				}
+			}
 		}
 
 		_starpu_spin_unlock(&handle->header_lock);
@@ -568,11 +596,12 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
 		if (mode & STARPU_W)
 			dst_replicate->initialized = 1;
-		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait)
+		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait
+			&& !_starpu_malloc_willpin_on_node(requesting_node))
 		{
-			/* And this is the main RAM, really no need for a
-			 * request, just allocate */
-			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
+			/* And this is the main RAM without pinning, really no need for a
+			 * request, just quickly allocate and be done */
+			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch, 0) == 0)
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);
 				if (dst_replicate->mc)
@@ -629,9 +658,12 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;
 
 		/* Try to reuse a request if possible */
+#ifdef STARPU_DEVEL
+#warning We do not actually want to reuse an existing request when our request is for a task with low priority, that will get executed much later. We don t want to wire down the data in between, at worse that could hog the complete gpu memory...
+#endif
 		r = _starpu_search_existing_data_request(hop_dst_replicate,
 				(mode & STARPU_R)?hop_src_node:hop_dst_node,
-							 mode, is_prefetch);
+							 mode, task, is_prefetch);
 
 		reused_requests[hop] = !!r;
 
@@ -640,7 +672,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			/* Create a new request if there was no request to reuse */
 			r = _starpu_create_data_request(handle, hop_src_replicate,
 							hop_dst_replicate, hop_handling_node,
-							mode, ndeps, is_prefetch, prio, 0, origin);
+							mode, ndeps, task, is_prefetch, prio, 0, origin);
 			nwait++;
 		}
 
@@ -686,7 +718,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		 */
 		struct _starpu_data_request *r = _starpu_create_data_request(handle, dst_replicate,
 							dst_replicate, requesting_node,
-							STARPU_W, nwait, is_prefetch, prio, 1, origin);
+							STARPU_W, nwait, task, is_prefetch, prio, 1, origin);
 
 		/* and perform the callback after termination */
 		_starpu_data_request_append_callback(r, callback_func, callback_arg);
@@ -701,8 +733,8 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		for (i = 0; i < nnodes; i++)
 			for (j = 0; j < nnodes; j++)
 			{
-				struct _starpu_data_request *r2 = handle->per_node[i].request[j];
-				if (r2)
+				struct _starpu_data_request *r2;
+				for (r2 = handle->per_node[i].request[j]; r2; r2 = r2->next_same_req)
 				{
 					_starpu_spin_lock(&r2->lock);
 					if (is_prefetch < r2->prefetch)
@@ -736,7 +768,8 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 }
 
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, enum starpu_is_prefetch is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached,
+			       struct starpu_task *task, enum starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
         _STARPU_LOG_IN();
@@ -745,7 +778,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(1);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -790,7 +823,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
 	struct _starpu_data_request *r;
 	r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
-						 is_prefetch, async, callback_func, callback_arg, prio, origin);
+						 task, is_prefetch, async, callback_func, callback_arg, prio, origin);
 
 	/* If no request was created, the handle was already up-to-date on the
 	 * node. In this case, _starpu_create_request_to_fetch_data has already
@@ -805,24 +838,24 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
         return ret;
 }
 
-static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
 }
 
-static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
 }
 
-static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
 }
 
-static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, task, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
 }
 
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
@@ -861,8 +894,15 @@ uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
 {
 	uint32_t wt_mask;
+	size_t max_wt_mask = sizeof(wt_mask) * 8;
+	unsigned wt_count = starpu_memory_nodes_get_count();
+	if (max_wt_mask > STARPU_MAXNODES)
+		max_wt_mask = STARPU_MAXNODES;
+	if (wt_count > max_wt_mask)
+		wt_count = max_wt_mask;
+
 	wt_mask = default_wt_mask | handle->wt_mask;
-	wt_mask &= (1<<starpu_memory_nodes_get_count())-1;
+	wt_mask &= (1ULL<<max_wt_mask)-1;
 
 	/* Note that it is possible that there is no valid copy of the data (if
 	 * starpu_data_invalidate was called for instance). In that case, we do
@@ -871,14 +911,14 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 	unsigned memory_node = replicate->memory_node;
 
 	if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
-	if (wt_mask & ~(1<<memory_node))
+	if (wt_mask && (memory_node >= max_wt_mask || wt_mask & ~(1<<memory_node)))
 		_starpu_write_through_data(handle, memory_node, wt_mask);
 
 	int cpt = 0;
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(1);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -897,26 +937,6 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
-static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
-{
-	int cpt = 0;
-	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
-	{
-		cpt++;
-		_starpu_datawizard_progress(1);
-	}
-	if (cpt == STARPU_SPIN_MAXTRY)
-		_starpu_spin_lock(&handle->header_lock);
-
-	if (replicate->state == STARPU_INVALID)
-	{
-		unsigned dst_node = replicate->memory_node;
-		replicate->requested |= 1UL << dst_node;
-	}
-
-	_starpu_spin_unlock(&handle->header_lock);
-}
-
 int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio, enum starpu_is_prefetch prefetch)
 {
 #ifdef STARPU_OPENMP
@@ -945,12 +965,9 @@ int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node,
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		if (prefetch == STARPU_PREFETCH)
-		{
-			task_prefetch_data_on_node(handle, node, replicate, mode, prio);
-			_starpu_set_data_requested_flag_if_needed(handle, replicate);
-		}
+			task_prefetch_data_on_node(handle, node, replicate, mode, task, prio);
 		else
-			idle_prefetch_data_on_node(handle, node, replicate, mode, prio);
+			idle_prefetch_data_on_node(handle, node, replicate, mode, task, prio);
 	}
 
 	if (prefetch == STARPU_PREFETCH)
@@ -1117,8 +1134,8 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
 		if (async)
 		{
-			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
-					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, task, STARPU_FETCH, 1,
+					_starpu_fetch_task_input_cb, worker, task->priority, "_starpu_fetch_task_input");
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
 				starpu_sleep(0.000001);
@@ -1133,7 +1150,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 		}
 		else
 		{
-			ret = fetch_data(handle, node, local_replicate, mode, 0);
+			ret = fetch_data(handle, node, local_replicate, mode, task, task->priority);
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
 				starpu_sleep(0.000001);
@@ -1371,7 +1388,7 @@ void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
 
 		local_replicate = get_replicate(handle, mode, -1, node);
 
-		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
+		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, task, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
 	}
 
 	if (profiling && task->profiling_info)
@@ -1421,7 +1438,7 @@ unsigned starpu_data_is_on_node(starpu_data_handle_t handle, unsigned node)
 
 		for (i = 0; i < nnodes; i++)
 		{
-			if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
+			if (handle->per_node[node].request[i])
 				ret = 1;
 		}
 

+ 10 - 10
src/datawizard/coherency.h

@@ -72,15 +72,13 @@ struct _starpu_data_replicate
 	 * */
 	unsigned automatically_allocated:1;
 
-	/** To help the scheduling policies to make some decision, we
-	   may keep a track of the tasks that are likely to request
-	   this data on the current node.
-	   It is the responsability of the scheduling _policy_ to set that
-	   flag when it assigns a task to a queue, policies which do not
-	   use this hint can simply ignore it.
-	 */
-	uint32_t requested;
+	/** This tracks the list of requests to provide the value */
 	struct _starpu_data_request *request[STARPU_MAXNODES];
+	/** This points to the last entry of request, to easily append to the list */
+	struct _starpu_data_request *last_request[STARPU_MAXNODES];
+
+	/** Which request is loading data here */
+	struct _starpu_data_request *load_request;
 
 	/** The number of prefetches that we made for this replicate for various tasks
 	 * This is also the number of tasks that we will wait to see use the mc before
@@ -322,7 +320,8 @@ struct _starpu_data_state
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  */
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, enum starpu_is_prefetch is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached,
+			       struct starpu_task *task, enum starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
@@ -369,7 +368,8 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
  */
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch,
+								  enum starpu_data_access_mode mode,
+								  struct starpu_task *task, enum starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 

+ 3 - 3
src/datawizard/copy_driver.c

@@ -200,7 +200,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 									struct _starpu_data_replicate *dst_replicate,
 									unsigned donotread,
 									struct _starpu_data_request *req,
-									unsigned may_alloc,
+									enum _starpu_may_alloc may_alloc,
 									enum starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
 {
 	if (!donotread)
@@ -215,11 +215,11 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 	/* first make sure the destination has an allocated buffer */
 	if (!dst_replicate->allocated)
 	{
-		if (!may_alloc || _starpu_is_reclaiming(dst_node))
+		if (may_alloc==STARPU_DATAWIZARD_DO_NOT_ALLOC || _starpu_is_reclaiming(dst_node))
 			/* We're not supposed to allocate there at the moment */
 			return -ENOMEM;
 
-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : STARPU_FETCH);
+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, prefetch, may_alloc==STARPU_DATAWIZARD_ONLY_FAST_ALLOC);
 		if (ret_alloc)
 			return -ENOMEM;
 	}

+ 8 - 1
src/datawizard/copy_driver.h

@@ -47,6 +47,13 @@ extern "C"
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 
+enum _starpu_may_alloc
+{
+	STARPU_DATAWIZARD_DO_NOT_ALLOC,
+	STARPU_DATAWIZARD_DO_ALLOC,
+	STARPU_DATAWIZARD_ONLY_FAST_ALLOC
+};
+
 #ifdef STARPU_USE_MIC
 /** MIC needs memory_node to know which MIC is concerned.
  * mark is used to wait asynchronous request.
@@ -131,7 +138,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 				    struct _starpu_data_replicate *dst_replicate,
 				    unsigned donotread,
 				    struct _starpu_data_request *req,
-				    unsigned may_alloc,
+				    enum _starpu_may_alloc may_alloc,
 				    enum starpu_is_prefetch prefetch);
 
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);

+ 306 - 179
src/datawizard/data_request.c

@@ -25,57 +25,67 @@
 #include <core/simgrid.h>
 
 /* requests that have not been treated at all */
-#ifdef STARPU_DEVEL
-#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectionnal
-#endif
-static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
-static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES]; /* Contains both task_prefetch and prefetch */
-static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
-static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES][STARPU_MAXNODES][2]; /* Contains both task_prefetch and prefetch */
+static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES][STARPU_MAXNODES][2];
 
 /* requests that are not terminated (eg. async transfers) */
-static struct _starpu_data_request_prio_list data_requests_pending[STARPU_MAXNODES];
-static unsigned data_requests_npending[STARPU_MAXNODES];
-static starpu_pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list data_requests_pending[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static unsigned data_requests_npending[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static starpu_pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES][STARPU_MAXNODES][2];
 
 void _starpu_init_data_request_lists(void)
 {
-	unsigned i;
+	unsigned i, j;
+	enum _starpu_data_request_inout k;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		_starpu_data_request_prio_list_init(&data_requests[i]);
-		_starpu_data_request_prio_list_init(&prefetch_requests[i]);
-		_starpu_data_request_prio_list_init(&idle_requests[i]);
+		for (j = 0; j < STARPU_MAXNODES; j++)
+		{
+			for (k = _STARPU_DATA_REQUEST_IN; k <= _STARPU_DATA_REQUEST_OUT; k++)
+			{
+				_starpu_data_request_prio_list_init(&data_requests[i][j][k]);
+				_starpu_data_request_prio_list_init(&prefetch_requests[i][j][k]);
+				_starpu_data_request_prio_list_init(&idle_requests[i][j][k]);
 
 #ifndef STARPU_DEBUG
-		/* Tell helgrind that we are fine with checking for list_empty
-		 * in _starpu_handle_node_data_requests, we will call it
-		 * periodically anyway */
-		STARPU_HG_DISABLE_CHECKING(data_requests[i].tree.root);
-		STARPU_HG_DISABLE_CHECKING(prefetch_requests[i].tree.root);
-		STARPU_HG_DISABLE_CHECKING(idle_requests[i].tree.root);
+				/* Tell helgrind that we are fine with checking for list_empty
+				 * in _starpu_handle_node_data_requests, we will call it
+				 * periodically anyway */
+				STARPU_HG_DISABLE_CHECKING(data_requests[i][j][k].tree.root);
+				STARPU_HG_DISABLE_CHECKING(prefetch_requests[i][j][k].tree.root);
+				STARPU_HG_DISABLE_CHECKING(idle_requests[i][j][k].tree.root);
 #endif
+				_starpu_data_request_prio_list_init(&data_requests_pending[i][j][k]);
+				data_requests_npending[i][j][k] = 0;
 
-		STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i], NULL);
-
-		_starpu_data_request_prio_list_init(&data_requests_pending[i]);
-		data_requests_npending[i] = 0;
-		STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i], NULL);
+				STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i][j][k], NULL);
+				STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i][j][k], NULL);
+			}
+		}
 	}
 	STARPU_HG_DISABLE_CHECKING(data_requests_npending);
 }
 
 void _starpu_deinit_data_request_lists(void)
 {
-	unsigned i;
+	unsigned i, j;
+	enum _starpu_data_request_inout k;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		_starpu_data_request_prio_list_deinit(&data_requests[i]);
-		_starpu_data_request_prio_list_deinit(&prefetch_requests[i]);
-		_starpu_data_request_prio_list_deinit(&idle_requests[i]);
-		STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i]);
-		_starpu_data_request_prio_list_deinit(&data_requests_pending[i]);
-		STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i]);
+		for (j = 0; j < STARPU_MAXNODES; j++)
+		{
+			for (k = _STARPU_DATA_REQUEST_IN; k <= _STARPU_DATA_REQUEST_OUT; k++)
+			{
+				_starpu_data_request_prio_list_deinit(&data_requests[i][j][k]);
+				_starpu_data_request_prio_list_deinit(&prefetch_requests[i][j][k]);
+				_starpu_data_request_prio_list_deinit(&idle_requests[i][j][k]);
+				_starpu_data_request_prio_list_deinit(&data_requests_pending[i][j][k]);
+				STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i][j][k]);
+				STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i][j][k]);
+			}
+		}
 	}
 }
 
@@ -92,23 +102,39 @@ static void _starpu_data_request_unlink(struct _starpu_data_request *r)
 		STARPU_ASSERT(r->mode == STARPU_W);
 		r->handle->write_invalidation_req = NULL;
 	}
-	else if (r->mode & STARPU_R)
-	{
-		/* If this is a read request, we store the pending requests
-		 * between src and dst. */
-		unsigned node = r->src_replicate->memory_node;
-		STARPU_ASSERT(r->dst_replicate->request[node] == r);
-		r->dst_replicate->request[node] = NULL;
-	}
 	else
 	{
-		/* If this is a write only request, then there is no source and
-		 * we use the destination node to cache the request. */
-		unsigned node = r->dst_replicate->memory_node;
-		STARPU_ASSERT(r->dst_replicate->request[node] == r);
-		r->dst_replicate->request[node] = NULL;
-	}
+		unsigned node;
+		struct _starpu_data_request **prevp, *prev;
+
+		if (r->mode & STARPU_R)
+			/* If this is a read request, we store the pending requests
+			 * between src and dst. */
+			node = r->src_replicate->memory_node;
+		else
+			/* If this is a write only request, then there is no source and
+			 * we use the destination node to cache the request. */
+			node = r->dst_replicate->memory_node;
+
+		/* Look for ourself in the list, we should be not very far. */
+		for (prevp = &r->dst_replicate->request[node], prev = NULL;
+		     *prevp && *prevp != r;
+		     prev = *prevp, prevp = &prev->next_same_req)
+			;
 
+		STARPU_ASSERT(*prevp == r);
+		*prevp = r->next_same_req;
+
+		if (!r->next_same_req)
+		{
+			/* I was last */
+			STARPU_ASSERT(r->dst_replicate->last_request[node] == r);
+			if (prev)
+				r->dst_replicate->last_request[node] = prev;
+			else
+				r->dst_replicate->last_request[node] = NULL;
+		}
+	}
 }
 
 static void _starpu_data_request_destroy(struct _starpu_data_request *r)
@@ -124,6 +150,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
+							 struct starpu_task *task,
 							 enum starpu_is_prefetch is_prefetch,
 							 int prio,
 							 unsigned is_write_invalidation,
@@ -135,7 +162,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
 	_starpu_spin_init(&r->lock);
 
-	_STARPU_TRACE_DATA_REQUEST_CREATED(handle, src_replicate?src_replicate->memory_node:-1, dst_replicate?dst_replicate->memory_node:-1, prio, is_prefetch);
+	_STARPU_TRACE_DATA_REQUEST_CREATED(handle, src_replicate?src_replicate->memory_node:-1, dst_replicate?dst_replicate->memory_node:-1, prio, is_prefetch, r);
 
 	r->origin = origin;
 	r->handle = handle;
@@ -153,22 +180,48 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	if (handling_node == -1)
 		handling_node = STARPU_MAIN_RAM;
 	r->handling_node = handling_node;
+	if (is_write_invalidation)
+	{
+		r->peer_node = handling_node;
+		r->inout = _STARPU_DATA_REQUEST_IN;
+	}
+	else if (dst_replicate->memory_node == handling_node)
+	{
+		if (src_replicate)
+			r->peer_node = src_replicate->memory_node;
+		else
+			r->peer_node = handling_node;
+		r->inout = _STARPU_DATA_REQUEST_IN;
+	}
+	else
+	{
+		r->peer_node = dst_replicate->memory_node;
+		r->inout = _STARPU_DATA_REQUEST_OUT;
+	}
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	r->completed = 0;
+	r->added_ref = 0;
+	r->canceled = 0;
 	r->prefetch = is_prefetch;
+	r->task = task;
 	r->nb_tasks_prefetch = 0;
 	r->prio = prio;
 	r->retval = -1;
 	r->ndeps = ndeps;
+	r->next_same_req = NULL;
 	r->next_req_count = 0;
 	r->callbacks = NULL;
 	r->com_id = 0;
 
 	_starpu_spin_lock(&r->lock);
 
-	/* Take a reference on the target for the request to be able to write it */
-	if (dst_replicate)
+	/* For a fetch, take a reference as soon as now on the target, to avoid
+	 * replicate eviction */
+	if (is_prefetch == STARPU_FETCH && dst_replicate)
+	{
+		r->added_ref = 1;
 		dst_replicate->refcnt++;
+	}
 	handle->busy_count++;
 
 	if (is_write_invalidation)
@@ -176,20 +229,28 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 		STARPU_ASSERT(!handle->write_invalidation_req);
 		handle->write_invalidation_req = r;
 	}
-	else if (mode & STARPU_R)
-	{
-		unsigned src_node = src_replicate->memory_node;
-		STARPU_ASSERT(!dst_replicate->request[src_node]);
-		dst_replicate->request[src_node] = r;
-		/* Take a reference on the source for the request to be able to read it */
-		src_replicate->refcnt++;
-		handle->busy_count++;
-	}
 	else
 	{
-		unsigned dst_node = dst_replicate->memory_node;
-		STARPU_ASSERT(!dst_replicate->request[dst_node]);
-		dst_replicate->request[dst_node] = r;
+		unsigned node;
+
+		if (mode & STARPU_R)
+			node = src_replicate->memory_node;
+		else
+			node = dst_replicate->memory_node;
+
+		if (!dst_replicate->request[node])
+			dst_replicate->request[node] = r;
+		else
+			dst_replicate->last_request[node]->next_same_req = r;
+		dst_replicate->last_request[node] = r;
+
+		if (mode & STARPU_R)
+		{
+			/* Take a reference on the source for the request to be
+			 * able to read it */
+			src_replicate->refcnt++;
+			handle->busy_count++;
+		}
 	}
 
 	r->refcnt = 1;
@@ -199,7 +260,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	return r;
 }
 
-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc)
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
 {
 	int retval;
 	int do_delete = 0;
@@ -310,14 +371,14 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 	}
 
 	/* insert the request in the proper list */
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][r->peer_node][r->inout]);
 	if (r->prefetch >= STARPU_IDLEFETCH)
-		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
+		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node][r->peer_node][r->inout], r);
 	else if (r->prefetch > STARPU_FETCH)
-		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
+		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node][r->peer_node][r->inout], r);
 	else
-		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node]);
+		_starpu_data_request_prio_list_push_back(&data_requests[handling_node][r->peer_node][r->inout], r);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][r->peer_node][r->inout]);
 
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	_starpu_wake_all_blocked_workers_on_node(handling_node);
@@ -352,7 +413,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
 
-	if (dst_replicate)
+	if (r->canceled < 2 && dst_replicate)
 	{
 #ifdef STARPU_MEMORY_STATS
 		enum _starpu_cache_state old_src_replicate_state = src_replicate->state;
@@ -360,6 +421,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
 		_starpu_spin_checklocked(&handle->header_lock);
 		_starpu_update_data_state(handle, r->dst_replicate, mode);
+		dst_replicate->load_request = NULL;
 
 #ifdef STARPU_MEMORY_STATS
 		if (src_replicate->state == STARPU_INVALID)
@@ -382,7 +444,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 #endif
 	}
 
-	if (r->com_id > 0)
+	if (r->canceled < 2 && r->com_id > 0)
 	{
 #ifdef STARPU_USE_FXT
 		unsigned src_node = src_replicate->memory_node;
@@ -414,12 +476,15 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	/* Remove a reference on the destination replicate for the request */
 	if (dst_replicate)
 	{
-		if (dst_replicate->mc)
+		if (r->canceled < 2 && dst_replicate->mc)
 			/* Make sure it stays there for the task.  */
 			dst_replicate->nb_tasks_prefetch += r->nb_tasks_prefetch;
 
-		STARPU_ASSERT(dst_replicate->refcnt > 0);
-		dst_replicate->refcnt--;
+		if (r->added_ref)
+		{
+			STARPU_ASSERT(dst_replicate->refcnt > 0);
+			dst_replicate->refcnt--;
+		}
 	}
 	STARPU_ASSERT(handle->busy_count > 0);
 	handle->busy_count--;
@@ -467,8 +532,16 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	}
 }
 
+void _starpu_data_request_complete_wait(void *arg)
+{
+	struct _starpu_data_request *r = arg;
+	_starpu_spin_lock(&r->handle->header_lock);
+	_starpu_spin_lock(&r->lock);
+	starpu_handle_data_request_completion(r);
+}
+
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, enum starpu_is_prefetch prefetch)
+static int starpu_handle_data_request(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
 {
 	starpu_data_handle_t handle = r->handle;
 
@@ -491,12 +564,50 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	struct _starpu_data_replicate *src_replicate = r->src_replicate;
 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
+	if (r->canceled)
+	{
+		/* Ok, canceled before starting copies etc. */
+		r->canceled = 2;
+		/* Nothing left to do */
+		starpu_handle_data_request_completion(r);
+		return 0;
+	}
+
+	if (dst_replicate)
+	{
+		struct _starpu_data_request *r2 = dst_replicate->load_request;
+		if (r2 && r2 != r)
+		{
+			/* Oh, some other transfer is already loading the value. Just wait for it */
+			r->canceled = 2;
+			_starpu_spin_unlock(&r->lock);
+			_starpu_spin_lock(&r2->lock);
+			_starpu_data_request_append_callback(r2, _starpu_data_request_complete_wait, r);
+			_starpu_spin_unlock(&r2->lock);
+			_starpu_spin_unlock(&handle->header_lock);
+			return 0;
+		}
+
+		/* We are loading this replicate.
+		 * Note: we might fail to allocate memory, but we will keep going and others will wait for us. */
+		dst_replicate->load_request = r;
+	}
+
 	enum starpu_data_access_mode r_mode = r->mode;
 
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->allocated);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->refcnt);
 
+	/* For prefetches, we take a reference on the destination only now that
+	 * we will really try to fetch the data (instead of in
+	 * _starpu_create_data_request) */
+	if (dst_replicate && r->prefetch > STARPU_FETCH)
+	{
+		r->added_ref = 1;	/* Note: we might get upgraded while trying to allocate */
+		dst_replicate->refcnt++;
+	}
+
 	_starpu_spin_unlock(&r->lock);
 
 	/* FIXME: the request may get upgraded from here to freeing it... */
@@ -507,7 +618,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
 	if (dst_replicate && dst_replicate->state == STARPU_INVALID)
 		r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
-						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc, prefetch);
+						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc, r->prefetch);
 	else
 		/* Already valid actually, no need to transfer anything */
 		r->retval = 0;
@@ -516,6 +627,15 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	{
 		/* If there was not enough memory, we will try to redo the
 		 * request later. */
+
+		if (r->prefetch > STARPU_FETCH)
+		{
+			STARPU_ASSERT(r->added_ref);
+			/* Drop ref until next try */
+			r->added_ref = 0;
+			dst_replicate->refcnt--;
+		}
+
 		_starpu_spin_unlock(&handle->header_lock);
 		return -ENOMEM;
 	}
@@ -528,10 +648,10 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 		 * requests in the meantime. */
 		_starpu_spin_unlock(&handle->header_lock);
 
-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
-		_starpu_data_request_prio_list_push_back(&data_requests_pending[r->handling_node], r);
-		data_requests_npending[r->handling_node]++;
-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node][r->peer_node][r->inout]);
+		_starpu_data_request_prio_list_push_back(&data_requests_pending[r->handling_node][r->peer_node][r->inout], r);
+		data_requests_npending[r->handling_node][r->peer_node][r->inout]++;
+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node][r->peer_node][r->inout]);
 
 		return -EAGAIN;
 	}
@@ -543,10 +663,9 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 }
 
-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list reqlist[STARPU_MAXNODES][STARPU_MAXNODES][2], unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
 {
 	struct _starpu_data_request *r;
-	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
 	unsigned i;
 	int ret = 0;
 
@@ -556,48 +675,55 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 	/* This is racy, but not posing problems actually, since we know we
 	 * will come back here to probe again regularly anyway.
 	 * Thus, do not expose this optimization to helgrind */
-	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&reqlist[src_node]))
+	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]))
 		return 0;
 #endif
 
-	/* TODO optimize */
+	/* We create a new list to pick up some requests from the main list, and
+	 * we handle the request(s) one by one from it, without concurrency issues.
+	 */
+	struct _starpu_data_request_list local_list, remain_list;
+	_starpu_data_request_list_init(&local_list);
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 	/* take all the entries from the request list */
-	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[src_node]))
+	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]))
 	{
 		/* List is busy, do not bother with it */
 		return -EBUSY;
 	}
 #else
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
 #endif
 
-	if (_starpu_data_request_prio_list_empty(&reqlist[src_node]))
+	for (i = data_requests_npending[handling_node][peer_node][inout];
+		i < n && ! _starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]);
+		i++)
 	{
-		/* there is no request */
-                STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
-		return 0;
+		r = _starpu_data_request_prio_list_pop_front_highest(&reqlist[handling_node][peer_node][inout]);
+		_starpu_data_request_list_push_back(&local_list, r);
 	}
 
-	/* There is an entry: we create a new empty list to replace the list of
-	 * requests, and we handle the request(s) one by one in the former
-	 * list, without concurrency issues.*/
-	struct _starpu_data_request_prio_list local_list = reqlist[src_node];
-	_starpu_data_request_prio_list_init(&reqlist[src_node]);
+	if (!_starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]))
+		/* We have left some requests */
+		ret = -EBUSY;
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+	if (_starpu_data_request_list_empty(&local_list))
+		/* there is no request */
+		return 0;
 
-	for (i = 0; i <= prefetch; i++)
-		_starpu_data_request_prio_list_init(&new_data_requests[i]);
+	/* This will contain the remaining requests */
+	_starpu_data_request_list_init(&remain_list);
 
 	double start = starpu_timing_now();
 	/* for all entries of the list */
-	while (!_starpu_data_request_prio_list_empty(&local_list))
+	while (!_starpu_data_request_list_empty(&local_list))
 	{
                 int res;
 
-		if (data_requests_npending[src_node] >= n)
+		if (data_requests_npending[handling_node][peer_node][inout] >= n)
 		{
 			/* Too many requests at the same time, skip pushing
 			 * more for now */
@@ -605,21 +731,22 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			break;
 		}
 
-		r = _starpu_data_request_prio_list_pop_front_highest(&local_list);
+		r = _starpu_data_request_list_pop_front(&local_list);
 
-		res = starpu_handle_data_request(r, may_alloc, prefetch);
+		res = starpu_handle_data_request(r, may_alloc);
 		if (res != 0 && res != -EAGAIN)
 		{
 			/* handle is busy, or not enough memory, postpone for now */
 			ret = res;
 			/* Prefetch requests might have gotten promoted while in tmp list */
-			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
+			_starpu_data_request_list_push_back(&remain_list, r);
 			if (prefetch > STARPU_FETCH)
 				/* Prefetching more there would make the situation even worse */
 				break;
 		}
+		else
+			(*pushed)++;
 
-		(*pushed)++;
 		if (starpu_timing_now() - start >= MAX_PUSH_TIME)
 		{
 			/* We have spent a lot of time doing requests, skip pushing more for now */
@@ -628,43 +755,23 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 		}
 	}
 
-	/* Push back requests we didn't handle on the proper list */
-	while (!_starpu_data_request_prio_list_empty(&local_list))
-	{
-		r = _starpu_data_request_prio_list_pop_front_highest(&local_list);
-		/* Prefetch requests might have gotten promoted while in tmp list */
-		_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
-	}
-	_starpu_data_request_prio_list_deinit(&local_list);
-
-	for (i = 0; i <= prefetch; i++)
-		if (!_starpu_data_request_prio_list_empty(&new_data_requests[i]))
-			break;
+	/* Gather remainder */
+	_starpu_data_request_list_push_list_back(&remain_list, &local_list);
 
-	if (i <= prefetch)
+	if (!_starpu_data_request_list_empty(&remain_list))
 	{
-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
-		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_FETCH])))
-		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_FETCH], &data_requests[src_node]);
-			data_requests[src_node] = new_data_requests[STARPU_FETCH];
-		}
-		if (prefetch >= STARPU_TASK_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_TASK_PREFETCH])))
+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
+		while (!_starpu_data_request_list_empty(&remain_list))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_TASK_PREFETCH], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[STARPU_TASK_PREFETCH];
-		}
-		if (prefetch >= STARPU_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_PREFETCH])))
-		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_PREFETCH], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[STARPU_PREFETCH];
-		}
-		if (prefetch >= STARPU_IDLEFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_IDLEFETCH])))
-		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_IDLEFETCH], &idle_requests[src_node]);
-			idle_requests[src_node] = new_data_requests[STARPU_IDLEFETCH];
+			r = _starpu_data_request_list_pop_back(&remain_list);
+			if (r->prefetch >= STARPU_IDLEFETCH)
+				_starpu_data_request_prio_list_push_front(&idle_requests[handling_node][r->peer_node][r->inout], r);
+			else if (r->prefetch > STARPU_FETCH)
+				_starpu_data_request_prio_list_push_front(&prefetch_requests[handling_node][r->peer_node][r->inout], r);
+			else
+				_starpu_data_request_prio_list_push_front(&data_requests[handling_node][r->peer_node][r->inout], r);
 		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
 
 #ifdef STARPU_SIMGRID
 		if (*pushed)
@@ -676,32 +783,32 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			 * for eviction to happen.
 			 */
 			starpu_sleep(0.000001);
-			_starpu_wake_all_blocked_workers_on_node(src_node);
+			_starpu_wake_all_blocked_workers_on_node(handling_node);
 		}
 #elif !defined(STARPU_NON_BLOCKING_DRIVERS)
-		_starpu_wake_all_blocked_workers_on_node(src_node);
+		_starpu_wake_all_blocked_workers_on_node(handling_node);
 #endif
 	}
 
 	return ret;
 }
 
-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
+	return __starpu_handle_node_data_requests(data_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
 }
 
-int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
+	return __starpu_handle_node_data_requests(prefetch_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
 }
 
-int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
+	return __starpu_handle_node_data_requests(idle_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
 }
 
-static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
+static int _handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned force)
 {
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
 //
@@ -712,14 +819,14 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 	/* Here helgrind would should that this is an un protected access.
 	 * We however don't care about missing an entry, we will get called
 	 * again sooner or later. */
-	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&data_requests_pending[src_node]))
+	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&data_requests_pending[handling_node][peer_node][inout]))
 		return 0;
 #endif
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 	if (!force)
 	{
-		if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[src_node]))
+		if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]))
 		{
 			/* List is busy, do not bother with it */
 			return 0;
@@ -728,19 +835,19 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 	else
 #endif
 		/* We really want to handle requests */
-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 
-	if (_starpu_data_request_prio_list_empty(&data_requests_pending[src_node]))
+	if (_starpu_data_request_prio_list_empty(&data_requests_pending[handling_node][peer_node][inout]))
 	{
 		/* there is no request */
-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 		return 0;
 	}
 	/* for all entries of the list */
-	struct _starpu_data_request_prio_list local_list = data_requests_pending[src_node];
-	_starpu_data_request_prio_list_init(&data_requests_pending[src_node]);
+	struct _starpu_data_request_prio_list local_list = data_requests_pending[handling_node][peer_node][inout];
+	_starpu_data_request_prio_list_init(&data_requests_pending[handling_node][peer_node][inout]);
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 
 	_starpu_data_request_prio_list_init(&new_data_requests_pending);
 	taken = 0;
@@ -803,55 +910,75 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 		}
 	}
 	_starpu_data_request_prio_list_deinit(&local_list);
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
-	data_requests_npending[src_node] -= taken - kept;
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
+	data_requests_npending[handling_node][peer_node][inout] -= taken - kept;
 	if (kept)
-		_starpu_data_request_prio_list_push_prio_list_back(&data_requests_pending[src_node], &new_data_requests_pending);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+		_starpu_data_request_prio_list_push_prio_list_back(&data_requests_pending[handling_node][peer_node][inout], &new_data_requests_pending);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 
 	return taken - kept;
 }
 
-int _starpu_handle_pending_node_data_requests(unsigned src_node)
+int _starpu_handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
-	return _handle_pending_node_data_requests(src_node, 0);
+	return _handle_pending_node_data_requests(handling_node, peer_node, inout, 0);
 }
 
-int _starpu_handle_all_pending_node_data_requests(unsigned src_node)
+int _starpu_handle_all_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
-	return _handle_pending_node_data_requests(src_node, 1);
+	return _handle_pending_node_data_requests(handling_node, peer_node, inout, 1);
 }
 
 /* Note: the returned value will be outdated since the locks are not taken at
  * entry/exit */
-int _starpu_check_that_no_data_request_exists(unsigned node)
+static int __starpu_check_that_no_data_request_exists(unsigned node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
 	int no_request;
 	int no_pending;
 
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[node]);
-	no_request = _starpu_data_request_prio_list_empty(&data_requests[node])
-	          && _starpu_data_request_prio_list_empty(&prefetch_requests[node])
-		  && _starpu_data_request_prio_list_empty(&idle_requests[node]);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[node]);
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[node]);
-	no_pending = !data_requests_npending[node];
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[node][peer_node][inout]);
+	no_request = _starpu_data_request_prio_list_empty(&data_requests[node][peer_node][inout])
+	          && _starpu_data_request_prio_list_empty(&prefetch_requests[node][peer_node][inout])
+		  && _starpu_data_request_prio_list_empty(&idle_requests[node][peer_node][inout]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[node][peer_node][inout]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[node][peer_node][inout]);
+	no_pending = !data_requests_npending[node][peer_node][inout];
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[node][peer_node][inout]);
 
 	return no_request && no_pending;
 }
 
+int _starpu_check_that_no_data_request_exists(unsigned node)
+{
+	unsigned peer_node, nnodes = starpu_memory_nodes_get_count();
+
+	for (peer_node = 0; peer_node < nnodes; peer_node++)
+		if (!__starpu_check_that_no_data_request_exists(node, peer_node, _STARPU_DATA_REQUEST_IN)
+		 || !__starpu_check_that_no_data_request_exists(node, peer_node, _STARPU_DATA_REQUEST_OUT))
+		 return 0;
+	 return 1;
+}
+
 /* Note: the returned value will be outdated since the locks are not taken at
  * entry/exit */
-int _starpu_check_that_no_data_request_is_pending(unsigned node)
+int _starpu_check_that_no_data_request_is_pending(unsigned node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
-	return !data_requests_npending[node];
+	return !data_requests_npending[node][peer_node][inout];
 }
 
 
 void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum starpu_is_prefetch prefetch)
 {
+	_starpu_spin_checklocked(&r->handle->header_lock);
 	STARPU_ASSERT(r->prefetch > prefetch);
+
+	if (prefetch == STARPU_FETCH && !r->added_ref)
+	{
+		/* That would have been done by _starpu_create_data_request */
+		r->added_ref = 1;
+		r->dst_replicate->refcnt++;
+	}
+
 	r->prefetch=prefetch;
 
 	if (prefetch >= STARPU_IDLEFETCH)
@@ -867,27 +994,27 @@ void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum starpu_
 			_starpu_update_prefetch_status(next_req, prefetch);
 	}
 
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node][r->peer_node][r->inout]);
 
 	int found = 1;
 
 	/* The request can be in a different list (handling request or the temp list)
 	 * we have to check that it is really in the prefetch or idle list. */
-	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
-		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node], r);
-	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
-		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node], r);
+	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node][r->peer_node][r->inout], r))
+		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node][r->peer_node][r->inout], r);
+	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node][r->peer_node][r->inout], r))
+		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node][r->peer_node][r->inout], r);
 	else
 		found = 0;
 
 	if (found)
 	{
 		if (prefetch > STARPU_FETCH)
-			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
+			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node][r->peer_node][r->inout],r);
 		else
-			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
+			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node][r->peer_node][r->inout],r);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[r->handling_node]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[r->handling_node][r->peer_node][r->inout]);
 
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	_starpu_wake_all_blocked_workers_on_node(r->handling_node);

+ 34 - 13
src/datawizard/data_request.h

@@ -32,8 +32,8 @@
  * Data interfaces should also have to declare how many asynchronous requests
  * they have actually started (think of e.g. csr).
  */
-#define MAX_PENDING_REQUESTS_PER_NODE 20
-#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 10
+#define MAX_PENDING_REQUESTS_PER_NODE 5
+#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 2
 #define MAX_PENDING_IDLE_REQUESTS_PER_NODE 1
 /** Maximum time in us that we can afford pushing requests before going back to the driver loop, e.g. for checking GPU task termination */
 #define MAX_PUSH_TIME 1000
@@ -47,6 +47,11 @@ struct _starpu_callback_list
 	struct _starpu_callback_list *next;
 };
 
+enum _starpu_data_request_inout
+{
+	_STARPU_DATA_REQUEST_IN, _STARPU_DATA_REQUEST_OUT
+};
+
 /** This represents a data request, i.e. we want some data to get transferred
  * from a source to a destination. */
 LIST_TYPE(_starpu_data_request,
@@ -63,6 +68,8 @@ LIST_TYPE(_starpu_data_request,
 	 * the node can make the CUDA/OpenCL calls.
 	 */
 	unsigned handling_node;
+	unsigned peer_node;
+	enum _starpu_data_request_inout inout;
 
 	/*
 	 * What the destination node wants to do with the data: write to it,
@@ -78,10 +85,19 @@ LIST_TYPE(_starpu_data_request,
 	struct _starpu_async_channel async_channel;
 
 	/** Whether the transfer is completed. */
-	unsigned completed;
+	unsigned completed:1;
+
+	/** Whether we have already added our reference to the dst replicate. */
+	unsigned added_ref:1;
+
+	/** Whether the request was canceled before being handled (because the transfer already happened another way). */
+	unsigned canceled:2;
 
 	/** Whether this is just a prefetch request */
-	enum starpu_is_prefetch prefetch;
+	enum starpu_is_prefetch prefetch:3;
+
+	/** Task this request is for */
+	struct starpu_task *task;
 
 	/** Number of tasks which used this as a prefetch */
 	unsigned nb_tasks_prefetch;
@@ -96,6 +112,10 @@ LIST_TYPE(_starpu_data_request,
 	 * dependencies. */
 	unsigned ndeps;
 
+	/** Some further tasks may have requested prefetches for the same data
+	 * much later on, link with them */
+	struct _starpu_data_request *next_same_req;
+
 	/** in case we have a chain of request (eg. for nvidia multi-GPU), this
 	 * is the list of requests which are waiting for this one. */
 	struct _starpu_data_request *next_req[STARPU_MAXNODES+1];
@@ -123,7 +143,7 @@ LIST_TYPE(_starpu_data_requester,
 
 	int prio;
 
-	/** if this is more complicated ... (eg. application request) 
+	/** if this is more complicated ... (eg. application request)
 	 * NB: this callback is not called with the lock taken !
 	 */
 	void (*ready_data_callback)(void *argcb);
@@ -135,15 +155,15 @@ void _starpu_init_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
 void _starpu_post_data_request(struct _starpu_data_request *r);
 /** returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
-int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
-int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
 
-int _starpu_handle_pending_node_data_requests(unsigned src_node);
-int _starpu_handle_all_pending_node_data_requests(unsigned src_node);
+int _starpu_handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
+int _starpu_handle_all_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
 
-int _starpu_check_that_no_data_request_exists(unsigned node);
-int _starpu_check_that_no_data_request_is_pending(unsigned node);
+int _starpu_check_that_no_data_request_exists(unsigned handling_node);
+int _starpu_check_that_no_data_request_is_pending(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
 
 struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t handle,
 							 struct _starpu_data_replicate *src_replicate,
@@ -151,12 +171,13 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
+							 struct starpu_task *task,
 							 enum starpu_is_prefetch is_prefetch,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
 
-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc);
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc);
 
 void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 					  void (*callback_func)(void *),

+ 87 - 25
src/datawizard/datawizard.c

@@ -26,19 +26,17 @@
 #include <core/simgrid.h>
 #endif
 
-int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests)
+static int ____starpu_datawizard_progress(unsigned memory_node, unsigned peer_start, unsigned peer_end, enum  _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned push_requests)
 {
 	int ret = 0;
-
-#ifdef STARPU_SIMGRID
-	/* XXX */
-	starpu_sleep(0.000001);
-#endif
-	STARPU_UYIELD();
+	unsigned peer_node;
 
 	/* in case some other driver requested data */
-	if (_starpu_handle_pending_node_data_requests(memory_node))
-		ret = 1;
+	for (peer_node = peer_start; peer_node < peer_end; peer_node++)
+	{
+		if (_starpu_handle_pending_node_data_requests(memory_node, peer_node, inout))
+			ret = 1;
+	}
 
 	starpu_memchunk_tidy(memory_node);
 
@@ -46,26 +44,70 @@ int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsi
 	{
 		/* Some transfers have finished, or the driver requests to really push more */
 		unsigned pushed;
-		if (_starpu_handle_node_data_requests(memory_node, may_alloc, &pushed) == 0)
+		unsigned ok = 1;
+
+		for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
 		{
+			if (_starpu_handle_node_data_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
+				ok = 0;
 			if (pushed)
 				ret = 1;
+		}
+
+		if (ok)
+		{
+			unsigned doidle = 1;
+
 			/* We pushed all pending requests, we can afford pushing
 			 * prefetch requests */
-			_starpu_handle_node_prefetch_requests(memory_node, may_alloc, &pushed);
-			if (_starpu_check_that_no_data_request_is_pending(memory_node))
+			for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
+			{
+				if (_starpu_handle_node_prefetch_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
+					ok = 0;
+				if (pushed)
+					ret = 1;
+				if (!_starpu_check_that_no_data_request_is_pending(memory_node, peer_node, inout))
+					doidle = 0;
+			}
+
+			if (doidle)
 				/* No pending transfer, push some idle transfer */
-				_starpu_handle_node_idle_requests(memory_node, may_alloc, &pushed);
+				for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
+				{
+					if (_starpu_handle_node_idle_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
+						ok = 0;
+					if (pushed)
+						ret = 1;
+				}
 		}
-		if (pushed)
-			ret = 1;
+
 	}
-	_starpu_execute_registered_progression_hooks();
 
 	return ret;
 }
 
-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
+static int ___starpu_datawizard_progress(unsigned memory_node, unsigned nnodes, enum _starpu_may_alloc may_alloc, unsigned push_requests)
+{
+	int ret = 0;
+	unsigned peer_node;
+
+#ifdef STARPU_SIMGRID
+	/* XXX */
+	starpu_sleep(0.000001);
+#endif
+	STARPU_UYIELD();
+
+	/* First handle all incoming transfers */
+	ret |= ____starpu_datawizard_progress(memory_node, 0, nnodes, _STARPU_DATA_REQUEST_IN, may_alloc, push_requests);
+
+	/* Then handle outgoing transfers */
+	for (peer_node = 0; peer_node < nnodes; peer_node++)
+		ret |= ____starpu_datawizard_progress(memory_node, peer_node, peer_node+1, _STARPU_DATA_REQUEST_OUT, may_alloc, push_requests);
+
+	return ret;
+}
+
+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests)
 {
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
         unsigned memnode;
@@ -77,7 +119,8 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 		int nnumas = starpu_memory_nodes_get_numa_count();
 		int numa;
 		for (numa = 0; numa < nnumas; numa++)
-			ret |=  ___starpu_datawizard_progress(numa, may_alloc, push_requests);
+			ret |=  ___starpu_datawizard_progress(numa, nnumas, may_alloc, push_requests);
+		_starpu_execute_registered_progression_hooks();
 
 		return ret;
 	}
@@ -87,19 +130,38 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 		worker = &worker->set->workers[0];
 
 	unsigned current_worker_id = worker->workerid;
-        int ret = 0;
+	int ret = 0;
 	unsigned nnodes = starpu_memory_nodes_get_count();
 
-        for (memnode = 0; memnode < nnodes; memnode++)
-        {
-                if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
-                        ret |= ___starpu_datawizard_progress(memnode, may_alloc, push_requests);
-        }
+	for (memnode = 0; memnode < nnodes; memnode++)
+	{
+		if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
+		{
+			if(_starpu_config.conf.cuda_only_fast_alloc_other_memnodes && worker->arch == STARPU_CUDA_WORKER && worker->memory_node != memnode)
+				ret |=  ___starpu_datawizard_progress(memnode, nnodes, STARPU_DATAWIZARD_ONLY_FAST_ALLOC, push_requests);
+			else
+				ret |=  ___starpu_datawizard_progress(memnode, nnodes, may_alloc, push_requests);
+			}
+	}
+
+	_starpu_execute_registered_progression_hooks();
 
         return ret;
 }
 
-void _starpu_datawizard_progress(unsigned may_alloc)
+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc)
 {
         __starpu_datawizard_progress(may_alloc, 1);
 }
+
+void _starpu_datawizard_handle_all_pending_node_data_requests(unsigned memnode)
+{
+	unsigned nnodes = starpu_memory_nodes_get_count();
+	unsigned memnode2;
+
+	for (memnode2 = 0; memnode2 < nnodes; memnode2++)
+	{
+		_starpu_handle_all_pending_node_data_requests(memnode, memnode2, _STARPU_DATA_REQUEST_IN);
+		_starpu_handle_all_pending_node_data_requests(memnode, memnode2, _STARPU_DATA_REQUEST_OUT);
+	}
+}

+ 8 - 7
src/datawizard/datawizard.h

@@ -34,18 +34,19 @@
 
 #include <core/dependencies/implicit_data_deps.h>
 
-/** Make data transfers progress on node \p memory_node.
+
+/** Make data transfers progress on all memory nodes driven by the current worker.
  *
  * If \p push_requests is 1, it can start new transfers
  *
- * If \p may_alloc is 1, it can allocate destination data for transfers
+ * If \p may_alloc is STARPU_DATAWIZARD_DO_ALLOC, it can allocate destination data for transfers
  * (this is not possible e.g. when spinning for a handle lock)
  */
-int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
-/** Call ___starpu_datawizard_progress() for all memory nodes driven by the
- * current worker */
-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests);
+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests);
 /** Call __starpu_datawizard_progress with push_requests = 1 */
-void _starpu_datawizard_progress(unsigned may_alloc);
+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc);
+
+/** Check for all pending data request progress on node \p memory_node */
+void _starpu_datawizard_handle_all_pending_node_data_requests(unsigned memnode);
 
 #endif // __DATAWIZARD_H__

+ 1 - 1
src/datawizard/filters.c

@@ -193,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		int home_node = initial_handle->home_node;
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 			home_node = STARPU_MAIN_RAM;
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH, 0);
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #endif

+ 5 - 2
src/datawizard/interfaces/data_interface.c

@@ -375,13 +375,14 @@ _starpu_data_initialize_per_worker(starpu_data_handle_t handle)
 		replicate->state = STARPU_INVALID;
 		//replicate->refcnt = 0;
 		replicate->handle = handle;
-		//replicate->requested = 0;
 		//replicate->nb_tasks_prefetch = 0;
 
 		//for (node = 0; node < STARPU_MAXNODES; node++)
 		//{
 		//	replicate->request[node] = NULL;
+		//	replicate->last_request[node] = NULL;
 		//}
+		//replicate->load_request = NULL;
 
 		/* Assuming being used for SCRATCH for now, patched when entering REDUX mode */
 		replicate->relaxed_coherency = 1;
@@ -785,7 +786,7 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	}
 	if (valid)
 	{
-		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
+		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
 	}
@@ -1033,6 +1034,7 @@ retry_busy:
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 		struct _starpu_data_replicate *local = &handle->per_node[node];
+		STARPU_ASSERT(!local->refcnt);
 		if (local->allocated)
 		{
 			_starpu_data_unregister_ram_pointer(handle, node);
@@ -1049,6 +1051,7 @@ retry_busy:
 		for (worker = 0; worker < nworkers; worker++)
 		{
 			struct _starpu_data_replicate *local = &handle->per_worker[worker];
+			STARPU_ASSERT(!local->refcnt);
 			/* free the data copy in a lazy fashion */
 			if (local->allocated && local->automatically_allocated)
 				_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);

+ 10 - 0
src/datawizard/malloc.c

@@ -149,6 +149,15 @@ static int _starpu_malloc_should_pin(int flags)
 	return 0;
 }
 
+int _starpu_malloc_willpin_on_node(unsigned dst_node)
+{
+	int flags = malloc_on_node_default_flags[dst_node];
+	return (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0
+			&& (_starpu_can_submit_cuda_task()
+			    /* || _starpu_can_submit_opencl_task() */
+			));
+}
+
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
 {
 	int ret=0;
@@ -185,6 +194,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 		goto end;
 	}
 
+	/* Note: synchronize this test with _starpu_malloc_willpin_on_node */
 	if (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
 		if (_starpu_can_submit_cuda_task())

+ 7 - 0
src/datawizard/malloc.h

@@ -26,4 +26,11 @@ void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
 
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
 int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags);
+
+/**
+   Returns whether when allocating data on \p dst_node, we will do pinning, i.e.
+   the allocation will be very expensive, and should thus be moved out from the
+   critical path
+  */
+int _starpu_malloc_willpin_on_node(unsigned dst_node);
 #endif

+ 21 - 11
src/datawizard/memalloc.c

@@ -169,7 +169,10 @@ void _starpu_mem_chunk_disk_register(unsigned disk_memnode)
 	{
 		enum starpu_node_kind kind = starpu_node_get_kind(i);
 		if (kind == STARPU_CPU_RAM)
+		{
+			STARPU_HG_DISABLE_CHECKING(evictable[i]);
 			evictable[i] = 1;
+		}
 	}
 }
 
@@ -327,7 +330,7 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 		{
 			/* This is the only copy, push it to destination */
 			struct _starpu_data_request *r;
-			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
+			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
 			/* There is no way we don't need a request, since
 			 * source is OWNER, destination can't be having it */
 			STARPU_ASSERT(r);
@@ -552,8 +555,9 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
 int starpu_data_can_evict(starpu_data_handle_t handle, unsigned node, enum starpu_is_prefetch is_prefetch)
 {
+	STARPU_ASSERT(node < STARPU_MAXNODES);
 	/* This data should be written through to this node, avoid dropping it! */
-	if (handle->wt_mask & (1<<node))
+	if (node < sizeof(handle->wt_mask) * 8 && handle->wt_mask & (1<<node))
 		return 0;
 
 	/* This data was registered from this node, we will not be able to drop it anyway */
@@ -1012,7 +1016,7 @@ restart2:
 				next_mc->remove_notify = &next_mc;
 			}
 			/* Note: this may unlock mc_list! */
-			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, STARPU_FETCH);
+			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, is_prefetch);
 
 			if (orig_next_mc)
 			{
@@ -1179,7 +1183,7 @@ void starpu_memchunk_tidy(unsigned node)
 			if (
 				/* This data should be written through to this node, avoid
 				 * dropping it! */
-				handle->wt_mask & (1<<node)
+				(node < sizeof(handle->wt_mask) * 8 && handle->wt_mask & (1<<node))
 				/* This is partitioned, don't care about the
 				 * whole data, we'll work on the subdatas.  */
 			     || handle->nchildren
@@ -1231,7 +1235,7 @@ void starpu_memchunk_tidy(unsigned node)
 			}
 
 			_starpu_spin_unlock(&mc_lock[node]);
-			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
+			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, NULL, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
 			{
 				/* No request was actually needed??
 				 * Odd, but cope with it.  */
@@ -1442,7 +1446,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
  *
  */
 
-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch)
+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
 {
 	unsigned attempts = 0;
 	starpu_ssize_t allocated_memory;
@@ -1473,6 +1477,12 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	if (!prefetch_oom)
 		_STARPU_TRACE_END_ALLOC_REUSE(dst_node, handle, 0);
 #endif
+
+	/* If this is RAM and pinned this will be slow
+	   In case we only want fast allocations return here */
+	if(only_fast_alloc && starpu_node_get_kind(dst_node) == STARPU_CPU_RAM && _starpu_malloc_willpin_on_node(dst_node))
+		return -ENOMEM;
+
 	STARPU_ASSERT(handle->ops);
 	STARPU_ASSERT(handle->ops->allocate_data_on_node);
 	STARPU_ASSERT(replicate->data_interface);
@@ -1576,7 +1586,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(0);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -1620,7 +1630,7 @@ out:
 	return allocated_memory;
 }
 
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
 {
 	starpu_ssize_t allocated_memory;
 
@@ -1635,7 +1645,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 		return 0;
 
 	STARPU_ASSERT(replicate->data_interface);
-	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch);
+	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch, only_fast_alloc);
 
 	/* perhaps we could really not handle that capacity misses */
 	if (allocated_memory == -ENOMEM)
@@ -1845,7 +1855,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
 			for (i=0; i<nb_numa_nodes; i++)
 			{
-				if (handle->per_node[i].allocated || 
+				if (handle->per_node[i].allocated ||
 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
 				{
 					target = i;
@@ -1877,7 +1887,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
 			for (i=0; i<nb_numa_nodes; i++)
 			{
-				if (handle->per_node[i].allocated || 
+				if (handle->per_node[i].allocated ||
 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
 				{
 					target = i;

+ 1 - 1
src/datawizard/memalloc.h

@@ -83,7 +83,7 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);

+ 3 - 4
src/datawizard/memory_nodes.c

@@ -151,6 +151,7 @@ void _starpu_memory_node_register_condition(struct _starpu_worker *worker, starp
 #undef starpu_worker_get_memory_node
 unsigned starpu_worker_get_memory_node(unsigned workerid)
 {
+	(void) workerid;
 	return _starpu_worker_get_memory_node(workerid);
 }
 
@@ -167,12 +168,10 @@ void _starpu_worker_drives_memory_node(struct _starpu_worker *worker, unsigned m
 	}
 }
 
+#undef starpu_worker_get_local_memory_node
 unsigned starpu_worker_get_local_memory_node(void)
 {
-	struct _starpu_worker *worker = _starpu_get_local_worker_key();
-	if (!worker)
-		return STARPU_MAIN_RAM;
-	return worker->memory_node;
+	return _starpu_worker_get_local_memory_node();
 }
 
 int starpu_memory_node_get_devid(unsigned node)

+ 21 - 0
src/datawizard/memory_nodes.h

@@ -117,12 +117,19 @@ static inline enum starpu_node_kind _starpu_node_get_kind(unsigned node)
 }
 #define starpu_node_get_kind _starpu_node_get_kind
 
+#if STARPU_MAXNODES == 1
+#define _starpu_memory_nodes_get_count() 1
+#else
 static inline unsigned _starpu_memory_nodes_get_count(void)
 {
 	return _starpu_descr.nnodes;
 }
+#endif
 #define starpu_memory_nodes_get_count _starpu_memory_nodes_get_count
 
+#if STARPU_MAXNODES == 1
+#define _starpu_worker_get_memory_node(workerid) 0
+#else
 static inline unsigned _starpu_worker_get_memory_node(unsigned workerid)
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
@@ -139,6 +146,20 @@ static inline unsigned _starpu_worker_get_memory_node(unsigned workerid)
 	return config->combined_workers[workerid - nworkers].memory_node;
 
 }
+#endif
 #define starpu_worker_get_memory_node _starpu_worker_get_memory_node
 
+#if STARPU_MAXNODES == 1
+#define _starpu_worker_get_local_memory_node() 0
+#else
+static inline unsigned _starpu_worker_get_local_memory_node(void)
+{
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	if (!worker)
+		return STARPU_MAIN_RAM;
+	return worker->memory_node;
+}
+#endif
+#define starpu_worker_get_local_memory_node _starpu_worker_get_local_memory_node
+
 #endif // __MEMORY_NODES_H__

+ 11 - 2
src/datawizard/reduction.c

@@ -280,12 +280,21 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 					redux_task->cl = handle->redux_cl;
 					STARPU_ASSERT(redux_task->cl);
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0)))
-						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW, 0);
+						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW|STARPU_COMMUTE, 0);
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 1)))
 						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_R, 1);
 
-					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
+					STARPU_ASSERT_MSG((STARPU_CODELET_GET_MODE(redux_task->cl, 0) & ~STARPU_COMMUTE) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
 					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet %p has to be R", redux_task->cl);
+					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0) & STARPU_COMMUTE))
+					{
+						static int warned;
+						if (!warned)
+						{
+							warned = 1;
+							_STARPU_DISP("Warning: for reductions, codelet %p should have STARPU_COMMUTE along STARPU_RW\n", redux_task->cl);
+						}
+					}
 
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i], 0);
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i+step], 1);

+ 11 - 8
src/datawizard/user_interactions.c

@@ -53,7 +53,7 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 
 	_starpu_spin_lock(&handle->header_lock);
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, NULL, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
 
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
@@ -126,7 +126,7 @@ static inline void _starpu_data_acquire_launch_fetch(struct user_interaction_wra
 	starpu_data_handle_t handle = wrapper->handle;
 	struct _starpu_data_replicate *replicate = node >= 0 ? &handle->per_node[node] : NULL;
 
-	int ret = _starpu_fetch_data_on_node(handle, node, replicate, wrapper->mode, wrapper->detached, wrapper->prefetch, async, callback, callback_arg, wrapper->prio, "_starpu_data_acquire_launch_fetch");
+	int ret = _starpu_fetch_data_on_node(handle, node, replicate, wrapper->mode, wrapper->detached, NULL, wrapper->prefetch, async, callback, callback_arg, wrapper->prio, "_starpu_data_acquire_launch_fetch");
 	STARPU_ASSERT(!ret);
 }
 
@@ -191,7 +191,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 							  void (*callback)(void *arg),
 							  void *arg,
 							  int sequential_consistency, int quick,
-							  long *pre_sync_jobid, long *post_sync_jobid)
+							  long *pre_sync_jobid, long *post_sync_jobid, int prio)
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data (%p) is not possible", handle);
@@ -211,6 +211,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 	wrapper->callback_arg = arg;
 	wrapper->pre_sync_task = NULL;
 	wrapper->post_sync_task = NULL;
+	wrapper->prio = prio;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int handle_sequential_consistency = handle->sequential_consistency;
@@ -225,6 +226,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 		wrapper->pre_sync_task->callback_func = starpu_data_acquire_cb_pre_sync_callback;
 		wrapper->pre_sync_task->callback_arg = wrapper;
 		wrapper->pre_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
+		wrapper->pre_sync_task->priority = prio;
 		pre_sync_job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
 		if (pre_sync_jobid)
 			*pre_sync_jobid = pre_sync_job->job_id;
@@ -233,6 +235,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 		wrapper->post_sync_task->name = "_starpu_data_acquire_cb_release";
 		wrapper->post_sync_task->detach = 1;
 		wrapper->post_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
+		wrapper->post_sync_task->priority = prio;
 		post_sync_job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
 		if (post_sync_jobid)
 			*post_sync_jobid = post_sync_job->job_id;
@@ -280,7 +283,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_quick(starpu_data_hand
 							  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg,
 							  int sequential_consistency, int quick)
 {
-	return starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(handle, node, mode, NULL, callback, arg, sequential_consistency, quick, NULL, NULL);
+	return starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(handle, node, mode, NULL, callback, arg, sequential_consistency, quick, NULL, NULL, STARPU_DEFAULT_PRIO);
 }
 
 int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node,
@@ -616,7 +619,7 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, 0);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, STARPU_DEFAULT_PRIO);
 }
 
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
@@ -626,7 +629,7 @@ int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node
 
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
-	return starpu_data_prefetch_on_node_prio(handle, node, async, 0);
+	return starpu_data_prefetch_on_node_prio(handle, node, async, STARPU_DEFAULT_PRIO);
 }
 
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
@@ -636,7 +639,7 @@ int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned
 
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
-	return starpu_data_idle_prefetch_on_node_prio(handle, node, async, 0);
+	return starpu_data_idle_prefetch_on_node_prio(handle, node, async, STARPU_DEFAULT_PRIO);
 }
 
 static void _starpu_data_wont_use(void *data)
@@ -817,7 +820,7 @@ void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int
 		unsigned node;
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
-			if (handle->per_node[memory_node].requested & (1UL << node))
+			if (handle->per_node[memory_node].request[node])
 			{
 				requested = 1;
 				break;

+ 2 - 2
src/datawizard/write_back.c

@@ -50,7 +50,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 				while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 				{
 					cpt++;
-					__starpu_datawizard_progress(1, 1);
+					__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 				}
 				if (cpt == STARPU_SPIN_MAXTRY)
 					_starpu_spin_lock(&handle->header_lock);
@@ -64,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
 				struct _starpu_data_request *r;
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
-									 STARPU_R, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
+									 STARPU_R, NULL, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
 
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */

+ 2 - 2
src/debug/latency.c

@@ -34,7 +34,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
-		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
 
@@ -44,7 +44,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
-		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
 	}

+ 51 - 42
src/debug/traces/starpu_fxt.c

@@ -251,11 +251,12 @@ static void task_dump(struct task_info *task, struct starpu_fxt_options *options
 		fprintf(tasks_file, "\n");
 		fprintf(tasks_file, "Modes:");
 		for (i = 0; i < task->ndata; i++)
-			fprintf(tasks_file, " %s%s%s%s%s",
+			fprintf(tasks_file, " %s%s%s%s%s%s",
 				(task->data[i].mode & STARPU_R)?"R":"",
 				(task->data[i].mode & STARPU_W)?"W":"",
 				(task->data[i].mode & STARPU_SCRATCH)?"S":"",
 				(task->data[i].mode & STARPU_REDUX)?"X":"",
+				(task->data[i].mode & STARPU_MPI_REDUX)?"X-mpi":"",
 				(task->data[i].mode & STARPU_COMMUTE)?"C":"");
 		fprintf(tasks_file, "\n");
 		fprintf(tasks_file, "Sizes:");
@@ -763,15 +764,20 @@ static void memnode_pop_state(double time, const char *prefix, unsigned int memn
 #endif
 }
 
-static void memnode_event(double time, const char *prefix, unsigned int memnodeid, const char *name, unsigned long handle, unsigned long info, unsigned long size, unsigned int dest, struct starpu_fxt_options *options)
+static void memnode_event(double time, const char *prefix, unsigned int memnodeid, const char *name, unsigned long handle, unsigned long value, unsigned long info, long size_prio, unsigned int dest, struct starpu_fxt_options *options)
 {
 	if (!options->memory_states)
 		return;
+	// If there is not a valid memory node, we cant associate it
+	if((int)memnodeid < 0)
+		return;
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
 	char p_handle[STARPU_POTI_STR_LEN];
+	char p_value[STARPU_POTI_STR_LEN];
 	memmanager_container_alias(container, STARPU_POTI_STR_LEN, prefix, memnodeid);
 	snprintf(p_handle, sizeof(p_handle), "%lx", handle);
+	snprintf(p_value, sizeof(p_value), "%lx", value);
 
 #ifdef HAVE_POTI_USER_NEWEVENT
 	char p_dest[STARPU_POTI_STR_LEN];
@@ -780,15 +786,15 @@ static void memnode_event(double time, const char *prefix, unsigned int memnodei
 
 	memmanager_container_alias(p_dest, STARPU_POTI_STR_LEN, prefix, dest);
 	snprintf(p_info, sizeof(p_info), "%lu", info);
-	snprintf(p_size, sizeof(p_size), "%lu", size);
+	snprintf(p_size, sizeof(p_size), "%ld", size_prio);
 
-	poti_user_NewEvent(_starpu_poti_MemoryEvent, time, container, name, "0", 4,
+	poti_user_NewEvent(_starpu_poti_MemoryEvent, time, container, name, p_value, 4,
 			   p_handle, p_info, p_size, p_dest);
 #else
 	poti_NewEvent(time, container, name, p_handle);
 #endif
 #else
-	fprintf(out_paje_file, "22    %.9f    %s %smm%u  0 %lx %lu %lu %smm%u\n", time, name, prefix, memnodeid, handle, info, size, prefix, dest);
+	fprintf(out_paje_file, "22    %.9f    %s %smm%u  %lx %lx %lu %ld %smm%u\n", time, name, prefix, memnodeid, value, handle, info, size_prio, prefix, dest);
 #endif
 }
 
@@ -2232,7 +2238,7 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 		{
 			double time = get_event_time_stamp(ev, options);
 			memnode_push_state(time, prefix, dst, "Co");
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCo", handle, comid, size, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCo", handle, 0, comid, size, src, options);
 #ifdef STARPU_HAVE_POTI
 			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN], src_memnode_container[STARPU_POTI_STR_LEN];
 			char program_container[STARPU_POTI_STR_LEN];
@@ -2351,7 +2357,7 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 		{
 			double time = get_event_time_stamp(ev, options);
 			memnode_pop_state(time, prefix, dst);
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoE", handle, comid, size, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoE", handle, 0, comid, size, src, options);
 #ifdef STARPU_HAVE_POTI
 			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
 			char dst_memnode_container[STARPU_POTI_STR_LEN], program_container[STARPU_POTI_STR_LEN];
@@ -2378,7 +2384,7 @@ static void handle_start_driver_copy_async(struct fxt_ev_64 *ev, struct starpu_f
 		if (out_paje_file)
 		{
 			memnode_push_state(get_event_time_stamp(ev, options), prefix, dst, "CoA");
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoA", 0, 0, 0, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoA", 0, 0, 0, 0, src, options);
 		}
 
 }
@@ -2394,7 +2400,7 @@ static void handle_end_driver_copy_async(struct fxt_ev_64 *ev, struct starpu_fxt
 		if (out_paje_file)
 		{
 			memnode_pop_state(get_event_time_stamp(ev, options), prefix, dst);
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoAE", 0, 0, 0, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoAE", 0, 0, 0, 0, src, options);
 		}
 }
 
@@ -2408,32 +2414,36 @@ static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
 }
 
+static void handle_data_request(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
+	unsigned memnode = ev->param[0];
+	unsigned dest = ev->param[1];
+	unsigned prio = ev->param[2];
+	unsigned long handle = ev->param[3];
+	unsigned prefe = ev->param[4];
+	unsigned long request = ev->param[5];
+
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, request, prefe, prio, dest, options);
+}
+
 static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 {
 	unsigned memnode = ev->param[0];
 	unsigned size = ev->param[2];
 	unsigned long handle = ev->param[3];
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, size, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, size, memnode, options);
 }
 
 static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 {
 	unsigned memnode = ev->param[0];
-	unsigned dest = ev->param[1];
-	if(strcmp(eventstr, "rc")==0)
-	{
-		//If it is a Request Create, use dest normally
-	}
-	else
-	{
-		dest = memnode;
-	}
+	//unsigned dest = ev->param[1]; // Not used
 	unsigned size = ev->param[2];
 	unsigned long handle = ev->param[3];
 	unsigned prefe = ev->param[4];
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, prefe, size, memnode, options);
 }
 
 static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -2442,7 +2452,7 @@ static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_o
 	unsigned long handle = ev->param[2];
 	unsigned info = ev->param[3];
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, info, 0, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, info, 0, memnode, options);
 }
 
 static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -2450,7 +2460,7 @@ static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt
 	unsigned memnode = ev->param[0];
 	unsigned long handle = ev->param[2];
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, 0, memnode, options);
 }
 
 static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -2458,7 +2468,7 @@ static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_o
 	unsigned memnode = ev->param[0];
 	unsigned long handle = ev->param[2];
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, 0, memnode, options);
 }
 
 static void handle_push_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -3702,13 +3712,12 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				if (options->memory_states)
 					handle_data_state(&ev, options, "SS");
 				break;
-                       case _STARPU_FUT_DATA_REQUEST_CREATED:
-                               if (!options->no_bus && options->memory_states)
-                               {
-                                       handle_memnode_event_start_4(&ev, options, "rc");
-                               }
-                               break;
-
+			case _STARPU_FUT_DATA_REQUEST_CREATED:
+				if (!options->no_bus && options->memory_states)
+				{
+					handle_data_request(&ev, options, "rc");
+				}
+				break;
 		  case _STARPU_FUT_PAPI_TASK_EVENT_VALUE:
 				handle_papi_event(&ev, options);
 				break;
@@ -4207,18 +4216,6 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		free(options->worker_archtypes[i].devices);
 
-	struct _starpu_symbol_name *itor, *next;
-	for (itor = _starpu_symbol_name_list_begin(&symbol_list);
-		itor != _starpu_symbol_name_list_end(&symbol_list);
-		itor = next)
-	{
-		next = _starpu_symbol_name_list_next(itor);
-
-		_starpu_symbol_name_list_erase(&symbol_list, itor);
-		free(itor->name);
-		_starpu_symbol_name_delete(itor);
-	}
-
 	_starpu_fxt_component_deinit();
 
 	free_worker_ids();
@@ -4608,6 +4605,17 @@ void _starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
 static
 void _starpu_fxt_paje_file_close(void)
 {
+	struct _starpu_symbol_name *itor, *next;
+	for (itor = _starpu_symbol_name_list_begin(&symbol_list);
+		itor != _starpu_symbol_name_list_end(&symbol_list);
+		itor = next)
+	{
+		next = _starpu_symbol_name_list_next(itor);
+
+		_starpu_symbol_name_list_erase(&symbol_list, itor);
+		free(itor->name);
+		_starpu_symbol_name_delete(itor);
+	}
 	if (out_paje_file)
 		fclose(out_paje_file);
 }
@@ -4658,6 +4666,7 @@ uint64_t _starpu_fxt_find_start_time(char *filename_in)
 
 void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 {
+	starpu_drivers_preinit();
 	_starpu_fxt_options_set_dir(options);
 	_starpu_fxt_dag_init(options->dag_path);
 	_starpu_fxt_distrib_file_init(options);

+ 3 - 2
src/drivers/cpu/driver_cpu.c

@@ -40,6 +40,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
+#include <datawizard/datawizard.h>
 #include <core/simgrid.h>
 #include <core/task.h>
 #include <core/disk.h>
@@ -341,7 +342,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 		return ret;
 	}
 
-	res = __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 	if (!pending_task)
 		task = _starpu_get_worker_task(cpu_worker, workerid, memnode);
@@ -429,7 +430,7 @@ int _starpu_cpu_driver_deinit(struct _starpu_worker *cpu_worker)
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
 	unsigned memnode = cpu_worker->memory_node;
-	_starpu_handle_all_pending_node_data_requests(memnode);
+	_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
 	/* In case there remains some memory that was automatically
 	 * allocated by StarPU, we release it now. Note that data

+ 4 - 36
src/drivers/cuda/driver_cuda.c

@@ -37,6 +37,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
+#include <datawizard/datawizard.h>
 #include <core/task.h>
 #include <common/knobs.h>
 
@@ -935,14 +936,13 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	if (!idle_tasks)
 	{
 		/* No task ready yet, no better thing to do than waiting */
-		__starpu_datawizard_progress(1, !idle_transfers);
+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
 		return 0;
 	}
 #endif
 
 	/* Something done, make some progress */
-	res = !idle_tasks || !idle_transfers;
-	res |= __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 	/* And pull tasks */
 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, worker0->memory_node);
@@ -950,9 +950,6 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 #ifdef STARPU_SIMGRID
 	if (!res)
 		starpu_pthread_wait_wait(&worker0->wait);
-#else
-	if (!res)
-		return 0;
 #endif
 
 	for (i = 0; i < (int) worker_set->nworkers; i++)
@@ -972,35 +969,6 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		{
 			/* this is neither a cuda or a cublas task */
 			_starpu_worker_refuse_task(worker, task);
-#if 0
-			if (worker->pipeline_length)
-			{
-				int j;
-				for (j = 0; j < worker->ntasks; j++)
-				{
-					const int j_mod = (j+worker->first_task)%STARPU_MAX_PIPELINE;
-					if (task == worker->current_tasks[j_mod])
-					{
-						worker->current_tasks[j_mod] = NULL;
-						if (j == 0)
-						{
-							worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
-							_starpu_set_current_task(NULL);
-						}
-						break;
-					}
-				}
-				STARPU_ASSERT(j<worker->ntasks);
-			}
-			else
-			{
-				worker->current_task = NULL;
-				_starpu_set_current_task(NULL);
-			}
-			worker->ntasks--;
-			int res = _starpu_push_task_to_workers(task);
-			STARPU_ASSERT_MSG(res == 0, "_starpu_push_task_to_workers() unexpectedly returned = %d\n", res);
-#endif
 			continue;
 		}
 
@@ -1039,7 +1007,7 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
 		if (!usersleft)
                 {
 			/* I'm last, deinitialize device */
-			_starpu_handle_all_pending_node_data_requests(memnode);
+			_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
 			/* In case there remains some memory that was automatically
 			 * allocated by StarPU, we release it now. Note that data

+ 3 - 3
src/drivers/mp_common/source_common.c

@@ -978,7 +978,7 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
 		}
 	}
 
-        res |= __starpu_datawizard_progress(1, 1);
+        res |= __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
         /* Handle message which have been store */
         _starpu_src_common_handle_stored_async(mp_node);
@@ -1075,7 +1075,7 @@ void _starpu_src_common_workers_set(struct _starpu_worker_set * worker_set, int
         for (device = 0; device < ndevices; device++)
 	{
         	_STARPU_TRACE_END_PROGRESS(memnode[device]);
-                _starpu_handle_all_pending_node_data_requests(memnode[device]);
+                _starpu_datawizard_handle_all_pending_node_data_requests(memnode[device]);
 	}
 
         /* In case there remains some memory that was automatically
@@ -1107,7 +1107,7 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set, unsigned
 
         _STARPU_TRACE_END_PROGRESS(memnode);
 
-        _starpu_handle_all_pending_node_data_requests(memnode);
+        _starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
         /* In case there remains some memory that was automatically
          * allocated by StarPU, we release it now. Note that data

+ 4 - 4
src/drivers/opencl/driver_opencl.c

@@ -31,6 +31,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
+#include <datawizard/datawizard.h>
 #include <core/task.h>
 #include <common/knobs.h>
 
@@ -787,13 +788,12 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 	if (!idle_tasks)
 	{
 		/* No task ready yet, no better thing to do than waiting */
-		__starpu_datawizard_progress(1, !idle_transfers);
+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
 		return 0;
 	}
 #endif
 
-	res = !idle_tasks || !idle_transfers;
-	res |= __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 	task = _starpu_get_worker_task(worker, workerid, memnode);
 
@@ -840,7 +840,7 @@ int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
 
 	unsigned memnode = worker->memory_node;
 
-	_starpu_handle_all_pending_node_data_requests(memnode);
+	_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
 	/* In case there remains some memory that was automatically
 	 * allocated by StarPU, we release it now. Note that data

+ 3 - 0
src/profiling/profiling.c

@@ -114,6 +114,9 @@ int starpu_profiling_status_set(int status)
 	{
 		struct _starpu_worker *worker_struct = _starpu_get_worker_struct(worker);
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_struct->sched_mutex);
+	}
+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
+	{
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[worker]);
 	}
 

+ 1 - 1
src/sched_policies/component_best_implementation.c

@@ -112,7 +112,7 @@ static struct starpu_task * best_implementation_pull_task(struct starpu_sched_co
 	}
 	if(task)
 		/* this worker can execute this task as it was returned by a pop*/
-		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_worker_get_id_check());
+		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_bitmap_first(&component->workers_in_ctx));
 	return task;
 }
 

+ 3 - 1
src/sched_policies/component_fifo.c

@@ -180,8 +180,10 @@ static struct starpu_task * fifo_pull_task(struct starpu_sched_component * compo
 	struct starpu_task * task;
 	if (data->ready && to->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE)
 		task = _starpu_fifo_pop_first_ready_task(queue, starpu_bitmap_first(&to->workers_in_ctx), -1);
+	else if (to->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS)
+		task = _starpu_fifo_pop_task(queue, starpu_bitmap_first(&to->workers_in_ctx));
 	else
-		task = _starpu_fifo_pop_task(queue, starpu_worker_get_id_check());
+		task = _starpu_fifo_pop_task(queue, -1);
 	if(task && data->exp)
 	{
 		if(!isnan(task->predicted))

+ 3 - 3
src/sched_policies/component_worker.c

@@ -443,8 +443,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 		if(task)
 		{
 			_starpu_worker_task_list_transfer_started(list, task);
-			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			starpu_push_task_end(task);
+			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			goto ret;
 		}
 		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
@@ -470,8 +470,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 			STARPU_COMPONENT_MUTEX_LOCK(&list->mutex);
 			_starpu_worker_task_list_add(list, task);
 			_starpu_worker_task_list_transfer_started(list, task);
-			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			starpu_push_task_end(task);
+			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			goto ret;
 		}
 		struct starpu_sched_component * combined_worker_component = starpu_sched_component_worker_get(component->tree->sched_ctx_id, workerid);
@@ -486,8 +486,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 		STARPU_COMPONENT_MUTEX_LOCK(&list->mutex);
 		_starpu_worker_task_list_add(list, task);
 		_starpu_worker_task_list_transfer_started(list, task);
-		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 		starpu_push_task_end(task);
+		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 	}
 ret:
 	return task;

+ 25 - 2
src/sched_policies/fifo_queues.c

@@ -352,6 +352,29 @@ int _starpu_normalize_prio(int priority, int num_priorities, unsigned sched_ctx_
 	return ((num_priorities-1)/(max-min)) * (priority - min);
 }
 
+size_t _starpu_size_non_ready_buffers(struct starpu_task *task, unsigned worker)
+{
+	size_t cnt = 0;
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	unsigned index;
+
+	for (index = 0; index < nbuffers; index++)
+	{
+		starpu_data_handle_t handle;
+		unsigned buffer_node = _starpu_task_data_get_node_on_worker(task, index, worker);
+
+		handle = STARPU_TASK_GET_HANDLE(task, index);
+
+		int is_valid;
+		starpu_data_query_status(handle, buffer_node, NULL, &is_valid, NULL);
+
+		if (!is_valid)
+			cnt+=starpu_data_get_size(handle);
+	}
+
+	return cnt;
+}
+
 int _starpu_count_non_ready_buffers(struct starpu_task *task, unsigned worker)
 {
 	int cnt = 0;
@@ -392,7 +415,7 @@ struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq
 
 		int first_task_priority = task->priority;
 
-		int non_ready_best = INT_MAX;
+		size_t non_ready_best = SIZE_MAX;
 
 		for (current = task; current; current = current->next)
 		{
@@ -400,7 +423,7 @@ struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq
 
 			if (priority >= first_task_priority)
 			{
-				int non_ready = _starpu_count_non_ready_buffers(current, workerid);
+				size_t non_ready = _starpu_size_non_ready_buffers(current, workerid);
 				if (non_ready < non_ready_best)
 				{
 					non_ready_best = non_ready;

+ 1 - 0
src/sched_policies/fifo_queues.h

@@ -69,6 +69,7 @@ struct starpu_task *_starpu_fifo_pop_local_task(struct _starpu_fifo_taskq *fifo)
 struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo, int workerid);
 int _starpu_normalize_prio(int priority, int num_priorities, unsigned sched_ctx_id);
 int _starpu_count_non_ready_buffers(struct starpu_task *task, unsigned worker);
+size_t _starpu_size_non_ready_buffers(struct starpu_task *task, unsigned worker);
 struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq *fifo_queue, unsigned workerid, int num_priorities);
 
 #endif // __FIFO_QUEUES_H__

+ 2 - 2
src/sched_policies/prio_deque.c

@@ -94,7 +94,7 @@ struct starpu_task *_starpu_prio_deque_deque_first_ready_task(struct _starpu_pri
 			return NULL;
 
 		int first_task_priority = task->priority;
-		int non_ready_best = INT_MAX;
+		size_t non_ready_best = SIZE_MAX;
 
 		for (current = starpu_task_prio_list_begin(&pdeque->list);
 		     current != starpu_task_prio_list_end(&pdeque->list);
@@ -104,7 +104,7 @@ struct starpu_task *_starpu_prio_deque_deque_first_ready_task(struct _starpu_pri
 
 			if (priority >= first_task_priority)
 			{
-				int non_ready = _starpu_count_non_ready_buffers(current, workerid);
+				size_t non_ready = _starpu_size_non_ready_buffers(current, workerid);
 				if (non_ready < non_ready_best)
 				{
 					non_ready_best = non_ready;

+ 5 - 0
src/sched_policies/work_stealing_policy.c

@@ -610,6 +610,11 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 	if (_starpu_worker_trylock(victim))
 	{
 		/* victim is busy, don't bother it, come back later */
+#ifdef STARPU_SIMGRID
+		starpu_sleep(0.000001);
+		/* Make sure we come back and not block */
+		starpu_wake_worker_no_relax(workerid);
+#endif
 		return NULL;
 	}
 	if (ws->per_worker[victim].running && ws->per_worker[victim].queue.ntasks > 0)

+ 0 - 0
src/util/execute_on_all.c


部分文件因文件數量過多而無法顯示