Преглед на файлове

Merge branch 'master' of gitlab.inria.fr:starpu/starpu

HE Kun преди 5 години
родител
ревизия
83bc792574
променени са 100 файла, в които са добавени 2708 реда и са изтрити 883 реда
  1. 10 1
      .gitlab-ci.yml
  2. 1 0
      AUTHORS
  3. 3 0
      ChangeLog
  4. 2 0
      Makefile.am
  5. 21 16
      configure.ac
  6. 18 3
      contrib/ci.inria.fr/job-1-check.sh
  7. 22 0
      contrib/gitlab/simgrid.sh
  8. 1 1
      doc/doxygen/chapters/101_building.doxy
  9. 2 1
      doc/doxygen/chapters/310_data_management.doxy
  10. 22 34
      doc/doxygen/chapters/320_scheduling.doxy
  11. 4 4
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  12. 31 0
      doc/doxygen/chapters/410_mpi_support.doxy
  13. 31 0
      doc/doxygen/chapters/501_environment_variables.doxy
  14. 1 1
      doc/doxygen/chapters/code/disk_copy.c
  15. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.eps
  16. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.pdf
  17. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.png
  18. 1 1
      examples/Makefile.am
  19. 6 2
      examples/basic_examples/multiformat_conversion_codelets.c
  20. 54 158
      examples/cg/cg.c
  21. 0 25
      examples/cg/cg.h
  22. 216 37
      examples/cg/cg_kernels.c
  23. 1 1
      examples/pi/pi_redux.c
  24. 1 1
      examples/reductions/dot_product.c
  25. 1 1
      examples/reductions/minmax_reduction.c
  26. 13 0
      include/fstarpu_mod.f90
  27. 8 0
      include/starpu.h
  28. 14 4
      include/starpu_data.h
  29. 8 0
      include/starpu_hash.h
  30. 3 5
      include/starpu_perfmodel.h
  31. 28 2
      include/starpu_task.h
  32. 4 0
      include/starpu_util.h
  33. 2 2
      julia/README
  34. 2 2
      julia/examples/execute.sh.in
  35. 2 2
      julia/setenv.sh
  36. 1 1
      julia/src/StarPU.jl
  37. 89 2
      mpi/examples/Makefile.am
  38. 422 0
      mpi/examples/cg/cg.c
  39. 201 0
      mpi/examples/mpi_redux/mpi_redux.c
  40. 253 0
      mpi/examples/native_fortran/nf_mpi_redux.f90
  41. 238 0
      mpi/examples/native_fortran/nf_redux_test.f90
  42. 9 0
      mpi/include/starpu_mpi.h
  43. 0 1
      mpi/src/mpi/starpu_mpi_early_data.h
  44. 40 34
      mpi/src/mpi/starpu_mpi_mpi.c
  45. 0 2
      mpi/src/mpi/starpu_mpi_mpi_backend.c
  46. 0 1
      mpi/src/mpi/starpu_mpi_mpi_backend.h
  47. 28 9
      mpi/src/starpu_mpi.c
  48. 1 2
      mpi/src/starpu_mpi_coop_sends.c
  49. 16 6
      mpi/src/starpu_mpi_private.h
  50. 55 20
      mpi/src/starpu_mpi_task_insert.c
  51. 1 1
      mpi/src/starpu_mpi_task_insert_fortran.c
  52. 1 1
      mpi/tests/mpi_reduction.c
  53. 3 0
      mpi/tests/mpi_redux.c
  54. 20 20
      src/common/fxt.h
  55. 5 0
      src/common/hash.c
  56. 4 4
      src/common/uthash.h
  57. 1 1
      src/core/dependencies/data_arbiter_concurrency.c
  58. 2 2
      src/core/dependencies/data_concurrency.c
  59. 6 2
      src/core/dependencies/implicit_data_deps.c
  60. 1 2
      src/core/jobs.c
  61. 18 11
      src/core/perfmodel/energy_model.c
  62. 8 4
      src/core/perfmodel/perfmodel_bus.c
  63. 4 3
      src/core/perfmodel/perfmodel_history.c
  64. 1 20
      src/core/sched_policy.c
  65. 0 2
      src/core/sched_policy.h
  66. 10 0
      src/core/workers.c
  67. 78 61
      src/datawizard/coherency.c
  68. 10 10
      src/datawizard/coherency.h
  69. 3 3
      src/datawizard/copy_driver.c
  70. 8 1
      src/datawizard/copy_driver.h
  71. 306 179
      src/datawizard/data_request.c
  72. 34 13
      src/datawizard/data_request.h
  73. 87 25
      src/datawizard/datawizard.c
  74. 8 7
      src/datawizard/datawizard.h
  75. 1 1
      src/datawizard/filters.c
  76. 5 2
      src/datawizard/interfaces/data_interface.c
  77. 10 0
      src/datawizard/malloc.c
  78. 7 0
      src/datawizard/malloc.h
  79. 21 11
      src/datawizard/memalloc.c
  80. 1 1
      src/datawizard/memalloc.h
  81. 3 4
      src/datawizard/memory_nodes.c
  82. 21 0
      src/datawizard/memory_nodes.h
  83. 11 2
      src/datawizard/reduction.c
  84. 11 8
      src/datawizard/user_interactions.c
  85. 2 2
      src/datawizard/write_back.c
  86. 2 2
      src/debug/latency.c
  87. 51 42
      src/debug/traces/starpu_fxt.c
  88. 3 2
      src/drivers/cpu/driver_cpu.c
  89. 4 36
      src/drivers/cuda/driver_cuda.c
  90. 3 3
      src/drivers/mp_common/source_common.c
  91. 4 4
      src/drivers/opencl/driver_opencl.c
  92. 3 0
      src/profiling/profiling.c
  93. 1 1
      src/sched_policies/component_best_implementation.c
  94. 3 1
      src/sched_policies/component_fifo.c
  95. 3 3
      src/sched_policies/component_worker.c
  96. 25 2
      src/sched_policies/fifo_queues.c
  97. 1 0
      src/sched_policies/fifo_queues.h
  98. 2 2
      src/sched_policies/prio_deque.c
  99. 5 0
      src/sched_policies/work_stealing_policy.c
  100. 0 0
      src/util/execute_on_all.c

+ 10 - 1
.gitlab-ci.yml

@@ -30,7 +30,7 @@ build:
       when: never  # Prevent pipeline run for push event
       when: never  # Prevent pipeline run for push event
     - when: always # Run pipeline for all other cases
     - when: always # Run pipeline for all other cases
 
 
-deploy:
+check:
   stage: deploy
   stage: deploy
   script:
   script:
     - ./contrib/gitlab/deploy.sh
     - ./contrib/gitlab/deploy.sh
@@ -38,3 +38,12 @@ deploy:
     - if: '$CI_PIPELINE_SOURCE == "push"'
     - if: '$CI_PIPELINE_SOURCE == "push"'
       when: never  # Prevent pipeline run for push event
       when: never  # Prevent pipeline run for push event
     - when: always # Run pipeline for all other cases
     - when: always # Run pipeline for all other cases
+
+simgrid:
+  stage: deploy
+  script:
+    - ./contrib/gitlab/simgrid.sh
+  rules:
+    - if: '$CI_PIPELINE_SOURCE == "push"'
+      when: never  # Prevent pipeline run for push event
+    - when: always # Run pipeline for all other cases

+ 1 - 0
AUTHORS

@@ -17,6 +17,7 @@ Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
 He Kun, Inria, <kun.he@inria.fr>
 He Kun, Inria, <kun.he@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>
+Jego Antoine, Enseeiht, <antoine.jego@etu.enseeiht.fr>
 Juhoor Mehdi, Université de Bordeaux, <mjuhoor@gmail.com>
 Juhoor Mehdi, Université de Bordeaux, <mjuhoor@gmail.com>
 Juven Alexis, Inria, <alexis.juven@inria.fr>
 Juven Alexis, Inria, <alexis.juven@inria.fr>
 Keryell-Even Maël, Inria, <mael.keryell@inria.fr>
 Keryell-Even Maël, Inria, <mael.keryell@inria.fr>

+ 3 - 0
ChangeLog

@@ -51,9 +51,11 @@ New features:
     starpu_mpi_interface_datatype_node_register which will be needed for
     starpu_mpi_interface_datatype_node_register which will be needed for
     MPI/NUMA/GPUDirect.
     MPI/NUMA/GPUDirect.
   * Add peek_data interface method.
   * Add peek_data interface method.
+  * Add STARPU_MPI_REDUX
 
 
 Small changes:
 Small changes:
   * Add a synthetic energy efficiency testcase.
   * Add a synthetic energy efficiency testcase.
+  * Make reduction methods want the commute flag.
 
 
 StarPU 1.3.8
 StarPU 1.3.8
 ====================================================================
 ====================================================================
@@ -67,6 +69,7 @@ Small features:
     STARPU_MPI_THREAD_COREID environment variables to bind threads to cores
     STARPU_MPI_THREAD_COREID environment variables to bind threads to cores
     instead of hyperthreads.
     instead of hyperthreads.
   * New STARPU_TASK_PROGRESS environment variable to show task progression.
   * New STARPU_TASK_PROGRESS environment variable to show task progression.
+  * Add STARPU_SIMGRID environment variable to guard against native builds.
 
 
 StarPU 1.3.7
 StarPU 1.3.7
 ====================================================================
 ====================================================================

+ 2 - 0
Makefile.am

@@ -53,9 +53,11 @@ if STARPU_BUILD_STARPURM
 SUBDIRS += starpurm
 SUBDIRS += starpurm
 endif
 endif
 
 
+if STARPU_USE_CPU
 if STARPU_BUILD_STARPUPY
 if STARPU_BUILD_STARPUPY
 SUBDIRS += starpupy
 SUBDIRS += starpupy
 endif
 endif
+endif
 
 
 if STARPU_BUILD_SC_HYPERVISOR
 if STARPU_BUILD_SC_HYPERVISOR
 SUBDIRS += sc_hypervisor
 SUBDIRS += sc_hypervisor

+ 21 - 16
configure.ac

@@ -167,9 +167,8 @@ if test x$enable_simgrid = xyes ; then
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
 	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
 	fi
 	fi
-	if test -n "$SIMGRID_LIBS" ; then
-		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
-	fi
+	SAVED_LIBS="${LIBS}"
+	LIBS="$SIMGRID_LIBS $LIBS"
 	AC_HAVE_LIBRARY([simgrid], [],
 	AC_HAVE_LIBRARY([simgrid], [],
 		[
 		[
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
@@ -207,6 +206,7 @@ if test x$enable_simgrid = xyes ; then
 
 
 	# Oldies for compatibility with older simgrid
 	# Oldies for compatibility with older simgrid
 	AC_CHECK_FUNCS([MSG_get_as_by_name MSG_zone_get_by_name MSG_environment_get_routing_root MSG_host_get_speed])
 	AC_CHECK_FUNCS([MSG_get_as_by_name MSG_zone_get_by_name MSG_environment_get_routing_root MSG_host_get_speed])
+	LIBS="${SAVED_LIBS}"
 
 
 	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
 	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
 	# We won't bind or detect anything
 	# We won't bind or detect anything
@@ -225,6 +225,7 @@ if test x$enable_simgrid = xyes ; then
 		SIMGRID_LIBS="$SIMGRID_LIBS -lstdc++"
 		SIMGRID_LIBS="$SIMGRID_LIBS -lstdc++"
 		LIBS="$LIBS -lstdc++"
 		LIBS="$LIBS -lstdc++"
 	fi
 	fi
+	SIMGRID_LDFLAGS="$SIMGRID_LIBS -lsimgrid"
 
 
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	case \ $CXXFLAGS\  in
 	case \ $CXXFLAGS\  in
@@ -267,13 +268,13 @@ if test x$enable_simgrid = xyes ; then
 		AC_PATH_PROG([SIMGRID_MC], [simgrid-mc], [no], [$simgrid_dir/bin:$PATH])
 		AC_PATH_PROG([SIMGRID_MC], [simgrid-mc], [no], [$simgrid_dir/bin:$PATH])
 		LDFLAGS="$LDFLAGS -Wl,-znorelro -Wl,-znoseparate-code"
 		LDFLAGS="$LDFLAGS -Wl,-znorelro -Wl,-znoseparate-code"
 		# libsimgrid needs to be linked from binaries themselves for MC to work
 		# libsimgrid needs to be linked from binaries themselves for MC to work
-		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lsimgrid"
+		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $SIMGRID_LDFLAGS"
 	fi
 	fi
 fi
 fi
 AM_CONDITIONAL(STARPU_SIMGRID_MC, test x$enable_simgrid_mc = xyes)
 AM_CONDITIONAL(STARPU_SIMGRID_MC, test x$enable_simgrid_mc = xyes)
 AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
 AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
 AC_SUBST(SIMGRID_CFLAGS)
 AC_SUBST(SIMGRID_CFLAGS)
-AC_SUBST(SIMGRID_LIBS)
+AC_SUBST(SIMGRID_LDFLAGS)
 AC_MSG_CHECKING(whether SimGrid is enabled)
 AC_MSG_CHECKING(whether SimGrid is enabled)
 AC_MSG_RESULT($enable_simgrid)
 AC_MSG_RESULT($enable_simgrid)
 
 
@@ -2304,9 +2305,6 @@ if test x$maxnodes = x0 ; then
 	if test x$enable_simgrid = xyes ; then
 	if test x$enable_simgrid = xyes ; then
 		# We need the room for the virtual CUDA/OpenCL devices
 		# We need the room for the virtual CUDA/OpenCL devices
 		nodes=`expr 4 + $nmaxcudadev + $nmaxopencldev + $nmaxmicdev + 1 + $nmaxmpidev`
 		nodes=`expr 4 + $nmaxcudadev + $nmaxopencldev + $nmaxmicdev + 1 + $nmaxmpidev`
-		if test $nodes -gt 32 ; then
-			nodes=32
-		fi
 	else
 	else
 		# We have one memory node shared by all CPU workers, one node per GPU
 		# We have one memory node shared by all CPU workers, one node per GPU
 		# and per MIC device
 		# and per MIC device
@@ -2342,8 +2340,7 @@ if test x$maxnodes = x0 ; then
 	done
 	done
 fi
 fi
 if test $maxnodes -gt 32 ; then
 if test $maxnodes -gt 32 ; then
-	# FIXME: at least use uint64 so we can have 64 memory nodes
-	AC_MSG_ERROR([selected number of nodes ($maxnodes) can not be greater than 32])
+	AC_MSG_WARN([Note: the wt_mask feature only supports 32 memory nodes])
 fi
 fi
 
 
 AC_MSG_CHECKING(maximum number of memory nodes)
 AC_MSG_CHECKING(maximum number of memory nodes)
@@ -3448,6 +3445,14 @@ then
 		AC_MSG_ERROR([python3 missing, cannot build StarPU python interface])
 		AC_MSG_ERROR([python3 missing, cannot build StarPU python interface])
 	fi
 	fi
 	AC_SUBST(PYTHON)
 	AC_SUBST(PYTHON)
+	PYTHON_INCLUDE_DIRS="`$PYTHON -c "from sysconfig import get_paths as gp; print(gp()@<:@'include'@:>@)"`"
+	SAVED_CPPFLAGS="${CPPFLAGS}"
+	CPPFLAGS="$CPPFLAGS -I$PYTHON_INCLUDE_DIRS"
+	AC_CHECK_HEADERS([Python.h],[have_python_h=yes],[have_python_h=no])
+	if test "$have_python_h" = "no" ; then
+		AC_MSG_ERROR([Python.h missing, cannot build StarPU python interface (consider installing python-dev)])
+	fi
+	CPPFLAGS=${SAVED_CPPFLAGS}
 	AC_MSG_CHECKING(for python3 module joblib)
 	AC_MSG_CHECKING(for python3 module joblib)
 	AC_PYTHON_MODULE(joblib,[joblib_avail=yes],[joblib_avail=no])
 	AC_PYTHON_MODULE(joblib,[joblib_avail=yes],[joblib_avail=no])
 	AC_MSG_RESULT($joblib_avail)
 	AC_MSG_RESULT($joblib_avail)
@@ -3565,7 +3570,7 @@ STARPU_H_CPPFLAGS="$HWLOC_CFLAGS $STARPU_CUDA_CPPFLAGS $STARPU_OPENCL_CPPFLAGS $
 AC_SUBST([STARPU_H_CPPFLAGS])
 AC_SUBST([STARPU_H_CPPFLAGS])
 
 
 # these are the flags needed for linking libstarpu (and thus also for static linking)
 # these are the flags needed for linking libstarpu (and thus also for static linking)
-LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LIBS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
+LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LDFLAGS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
 AC_SUBST([LIBSTARPU_LDFLAGS])
 AC_SUBST([LIBSTARPU_LDFLAGS])
 
 
 # these are the flags needed for linking against libstarpu (because starpu.h makes its includer use pthread_*, simgrid, etc.)
 # these are the flags needed for linking against libstarpu (because starpu.h makes its includer use pthread_*, simgrid, etc.)
@@ -3805,11 +3810,11 @@ AC_MSG_NOTICE([
 	       OpenMP runtime support enabled:                $enable_openmp
 	       OpenMP runtime support enabled:                $enable_openmp
 	       Cluster support enabled:                       $enable_cluster
 	       Cluster support enabled:                       $enable_cluster
 	       SOCL enabled:                                  $build_socl
 	       SOCL enabled:                                  $build_socl
-               SOCL test suite:                               $run_socl_check
-               Scheduler Hypervisor:                          $build_sc_hypervisor
-               simgrid enabled:                               $enable_simgrid
-               ayudame enabled:                               $ayu_msg
-               HDF5 enabled:                                  $enable_hdf5
+	       SOCL test suite:                               $run_socl_check
+	       Scheduler Hypervisor:                          $build_sc_hypervisor
+	       simgrid enabled:                               $enable_simgrid
+	       ayudame enabled:                               $ayu_msg
+	       HDF5 enabled:                                  $enable_hdf5
 	       Native fortran support:                        $enable_build_fortran
 	       Native fortran support:                        $enable_build_fortran
 	       Native MPI fortran support:                    $use_mpi_fort
 	       Native MPI fortran support:                    $use_mpi_fort
 	       Support for multiple linear regression models: $support_mlr
 	       Support for multiple linear regression models: $support_mlr

+ 18 - 3
contrib/ci.inria.fr/job-1-check.sh

@@ -37,7 +37,11 @@ basename=$(basename $tarball .tar.gz)
 export STARPU_HOME=$PWD/$basename/home
 export STARPU_HOME=$PWD/$basename/home
 mkdir -p $basename
 mkdir -p $basename
 cd $basename
 cd $basename
-env > $PWD/env
+(
+    echo "oldPWD=\${PWD}"
+    env|grep -v LS_COLORS | grep '^[A-Z]'|grep -v BASH_FUNC | grep '=' | sed 's/=/=\"/'| sed 's/$/\"/' | sed 's/^/export /'
+    echo "cd \$oldPWD"
+) > ${PWD}/env
 
 
 test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
 test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
 tar xfz ../$tarball
 tar xfz ../$tarball
@@ -63,7 +67,17 @@ fi
 
 
 export CC=gcc
 export CC=gcc
 
 
-CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc"
+set +e
+mpiexec -oversubscribe pwd 2>/dev/null
+ret=$?
+set -e
+ARGS=""
+if test "$ret" = "0"
+then
+    ARGS="--with-mpiexec-args=-oversubscribe"
+fi
+
+CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc $ARGS"
 CONFIGURE_CHECK=""
 CONFIGURE_CHECK=""
 day=$(date +%u)
 day=$(date +%u)
 if test $day -le 5
 if test $day -le 5
@@ -72,10 +86,11 @@ then
 #else
 #else
     # we do a normal check, a long check takes too long on VM nodes
     # we do a normal check, a long check takes too long on VM nodes
 fi
 fi
-../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS
+../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS $STARPU_USER_CONFIGURE_OPTIONS
 
 
 export STARPU_TIMEOUT_ENV=1800
 export STARPU_TIMEOUT_ENV=1800
 export MPIEXEC_TIMEOUT=1800
 export MPIEXEC_TIMEOUT=1800
+
 make
 make
 #make check
 #make check
 (make -k check || true) 2>&1 | tee  ../check_$$
 (make -k check || true) 2>&1 | tee  ../check_$$

+ 22 - 0
contrib/gitlab/simgrid.sh

@@ -0,0 +1,22 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+STARPU_USER_CONFIGURE_OPTIONS="--enable-simgrid --disable-mpi --disable-mpi-check" ./contrib/ci.inria.fr/job-1-check.sh
+
+
+
+

+ 1 - 1
doc/doxygen/chapters/101_building.doxy

@@ -520,7 +520,7 @@ It can also be convenient to try simulated benchmarks, if you want to give a try
 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
 using the SimGrid version of StarPU: first install the SimGrid simulator from
 using the SimGrid version of StarPU: first install the SimGrid simulator from
 http://simgrid.gforge.inria.fr/ (we tested with SimGrid from 3.11 to 3.16, and
 http://simgrid.gforge.inria.fr/ (we tested with SimGrid from 3.11 to 3.16, and
-3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with -Denable_msg=ON.
+3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with \c -Denable_msg=ON.
 Other versions may have compatibility issues, 3.17 notably does
 Other versions may have compatibility issues, 3.17 notably does
 not build at all. MPI simulation does not work with version 3.22).
 not build at all. MPI simulation does not work with version 3.22).
 Then configure StarPU with \ref enable-simgrid
 Then configure StarPU with \ref enable-simgrid

+ 2 - 1
doc/doxygen/chapters/310_data_management.doxy

@@ -643,7 +643,8 @@ struct starpu_codelet accumulate_variable_cl =
         .cpu_funcs = { accumulate_variable_cpu },
         .cpu_funcs = { accumulate_variable_cpu },
         .cpu_funcs_name = { "accumulate_variable_cpu" },
         .cpu_funcs_name = { "accumulate_variable_cpu" },
         .cuda_funcs = { accumulate_variable_cuda },
         .cuda_funcs = { accumulate_variable_cuda },
-        .nbuffers = 1,
+        .nbuffers = 2,
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 }
 }
 \endcode
 \endcode
 
 

Файловите разлики са ограничени, защото са твърде много
+ 22 - 34
doc/doxygen/chapters/320_scheduling.doxy


+ 4 - 4
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -515,12 +515,12 @@ The <c>-f</c> option can also be used to display the performance in terms of GFl
 
 
 \verbatim
 \verbatim
 $ tools/starpu_perfmodel_plot -f -e -s non_linear_memset_regression_based_energy
 $ tools/starpu_perfmodel_plot -f -e -s non_linear_memset_regression_based_energy
-$ gnuplot starpu_non_linear_memset_regression_based_energy.gp
-$ gv starpu_non_linear_memset_regression_based_energy.eps
+$ gnuplot starpu_gflops_non_linear_memset_regression_based_energy.gp
+$ gv starpu_gflops_non_linear_memset_regression_based_energy.eps
 \endverbatim
 \endverbatim
 
 
-\image html starpu_non_linear_memset_regression_based_energy_flops.png
-\image latex starpu_non_linear_memset_regression_based_energy_flops.eps "" width=\textwidth
+\image html starpu_gflops_non_linear_memset_regression_based_energy.png
+\image latex starpu_gflops_non_linear_memset_regression_based_energy.eps "" width=\textwidth
 
 
 We clearly see here that it is much more energy-efficient to stay in the L3 cache.
 We clearly see here that it is much more energy-efficient to stay in the L3 cache.
 
 

+ 31 - 0
doc/doxygen/chapters/410_mpi_support.doxy

@@ -744,6 +744,37 @@ starpu_mpi_data_set_rank(data, STARPU_MPI_PER_NODE);
 
 
 The data can then be used just like pernode above.
 The data can then be used just like pernode above.
 
 
+\section MPIMpiRedux Inter-node reduction
+
+One might want to leverage a reduction pattern across several nodes.
+Using \c STARPU_REDUX, one can obtain reduction patterns across several nodes,
+however each core across the contributing nodes will spawn its own
+contribution to work with. In the case that these allocations or the
+required reductions are too expensive to execute for each contribution,
+the access mode \c STARPU_MPI_REDUX tells StarPU to spawn only one contribution
+on each node executing tasks partaking in the reduction.
+
+Tasks producing a result in the inter-node reduction should be registered as
+accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode.
+
+\code{.c}
+static struct starpu_codelet contrib_cl =
+{
+	.cpu_funcs = {cpu_contrib}, /* cpu implementation(s) of the routine */
+	.nbuffers = 1, /* number of data handles referenced by this routine */
+	.modes = {STARPU_RW | STARPU_COMMUTE}, /* access modes for the contribution */
+	.name = "contribution"
+};
+\endcode
+
+When inserting these tasks, the access mode handed out to the StarPU-MPI layer
+should be \c STARPU_MPI_REDUX. Assuming \c data is owned by node 0 and we want node
+1 to compute the contribution, we could do the following.
+
+\code{.c}
+starpu_mpi_task_insert(MPI_COMM_WORLD, &contrib_cl, STARPU_MPI_REDUX, data, EXECUTE_ON_NODE, 1); /* Node 1 computes it */
+\endcode
+
 \section MPIPriorities Priorities
 \section MPIPriorities Priorities
 
 
 All send functions have a <c>_prio</c> variant which takes an additional
 All send functions have a <c>_prio</c> variant which takes an additional

+ 31 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -473,6 +473,16 @@ todo
 todo
 todo
 </dd>
 </dd>
 
 
+<dt>STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES</dt>
+<dd>
+\anchor STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
+\addindex __env__STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
+Specify if CUDA workers should do only fast allocations
+when running the datawizard progress of
+other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
+Default value is 0, allowing CUDA workers to do slow allocations.
+</dd>
+
 </dl>
 </dl>
 
 
 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
@@ -738,6 +748,27 @@ block when the memory allocation required for network reception overflows the
 available main memory (as typically set by \ref STARPU_LIMIT_CPU_MEM)
 available main memory (as typically set by \ref STARPU_LIMIT_CPU_MEM)
 </dd>
 </dd>
 
 
+<dt>STARPU_MPI_EARLYDATA_ALLOCATE</dt>
+<dd>
+\anchor STARPU_MPI_EARLYDATA_ALLOCATE
+\addindex __env__STARPU_MPI_EARLYDATA_ALLOCATE
+When set to 1, the MPI Driver will immediately allocate the data for early
+requests instead of issuing a data request and blocking. The default value is 0,
+issuing a data request. Because it is an early request and we do not know its
+real priority, the data request will assume \ref STARPU_DEFAULT_PRIO. In cases
+where there are many data requests with priorities greater than
+\ref STARPU_DEFAULT_PRIO, the MPI driver could be blocked for long periods.
+</dd>
+
+<dt>STARPU_SIMGRID</dt>
+<dd>
+\anchor STARPU_SIMGRID
+\addindex __env__STARPU_SIMGRID
+When set to 1 (the default is 0), this makes StarPU check that it was really
+built with simulation support. This is convenient in scripts to avoid using a
+native version, which would try to update performance models.
+</dd>
+
 <dt>STARPU_SIMGRID_TRANSFER_COST</dt>
 <dt>STARPU_SIMGRID_TRANSFER_COST</dt>
 <dd>
 <dd>
 \anchor STARPU_SIMGRID_TRANSFER_COST
 \anchor STARPU_SIMGRID_TRANSFER_COST

+ 1 - 1
doc/doxygen/chapters/code/disk_copy.c

@@ -33,7 +33,7 @@
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	double * A,*B,*C,*D,*E,*F;
+	double *A, *F;
 
 
 	/* limit main ram to force to push in disk */
 	/* limit main ram to force to push in disk */
 	setenv("STARPU_LIMIT_CPU_MEM", "160", 1);
 	setenv("STARPU_LIMIT_CPU_MEM", "160", 1);

doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.eps → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.eps


doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.pdf → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.pdf


doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.png → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.png


+ 1 - 1
examples/Makefile.am

@@ -106,6 +106,7 @@ examplebin_PROGRAMS =
 noinst_HEADERS = 				\
 noinst_HEADERS = 				\
 	axpy/axpy.h                             \
 	axpy/axpy.h                             \
 	cg/cg.h					\
 	cg/cg.h					\
+	cg/cg_kernels.c				\
 	heat/lu_kernels_model.h			\
 	heat/lu_kernels_model.h			\
 	heat/dw_sparse_cg.h			\
 	heat/dw_sparse_cg.h			\
 	heat/heat.h				\
 	heat/heat.h				\
@@ -869,7 +870,6 @@ if !STARPU_NO_BLAS_LIB
 
 
 cg_cg_SOURCES =					\
 cg_cg_SOURCES =					\
 	cg/cg.c					\
 	cg/cg.c					\
-	cg/cg_kernels.c				\
 	common/blas.c
 	common/blas.c
 
 
 cg_cg_LDADD =					\
 cg_cg_LDADD =					\

+ 6 - 2
examples/basic_examples/multiformat_conversion_codelets.c

@@ -41,6 +41,7 @@ struct starpu_codelet cpu_to_cuda_cl =
 	.cuda_funcs = {cpu_to_cuda_cuda_func},
 	.cuda_funcs = {cpu_to_cuda_cuda_func},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1,
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.name = "codelet_cpu_to_cuda"
 	.name = "codelet_cpu_to_cuda"
 };
 };
 
 
@@ -48,6 +49,7 @@ struct starpu_codelet cuda_to_cpu_cl =
 {
 {
 	.cpu_funcs = {cuda_to_cpu},
 	.cpu_funcs = {cuda_to_cpu},
 	.nbuffers = 1,
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.name = "codelet_cude_to_cpu"
 	.name = "codelet_cude_to_cpu"
 };
 };
 #endif
 #endif
@@ -73,12 +75,14 @@ struct starpu_codelet cpu_to_opencl_cl =
 {
 {
 	.opencl_funcs = {cpu_to_opencl_opencl_func},
 	.opencl_funcs = {cpu_to_opencl_opencl_func},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
 };
 };
 
 
 struct starpu_codelet opencl_to_cpu_cl =
 struct starpu_codelet opencl_to_cpu_cl =
 {
 {
 	.cpu_funcs = {opencl_to_cpu},
 	.cpu_funcs = {opencl_to_cpu},
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
 };
 };
 #endif
 #endif

+ 54 - 158
examples/cg/cg.c

@@ -19,11 +19,6 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <common/blas.h>
 #include <common/blas.h>
 
 
-#ifdef STARPU_USE_CUDA
-#include <cuda.h>
-#endif
-
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
 
 /*
 /*
  *	Conjugate Gradient
  *	Conjugate Gradient
@@ -68,32 +63,34 @@
 
 
 #include "cg.h"
 #include "cg.h"
 
 
-static int long long n = 4096;
-static int nblocks = 8;
-static int use_reduction = 1;
+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks);
 
 
-static starpu_data_handle_t A_handle, b_handle, x_handle;
-static TYPE *A, *b, *x;
+#define HANDLE_TYPE_VECTOR starpu_data_handle_t
+#define HANDLE_TYPE_MATRIX starpu_data_handle_t
+#define TASK_INSERT(cl, ...) starpu_task_insert(cl, ##__VA_ARGS__)
+#define GET_VECTOR_BLOCK(v, i) starpu_data_get_sub_data(v, 1, i)
+#define GET_MATRIX_BLOCK(m, i, j) starpu_data_get_sub_data(m, 2, i, j)
+#define BARRIER()
+#define GET_DATA_HANDLE(handle)
+#define FPRINTF_SERVER FPRINTF
+
+#include "cg_kernels.c"
 
 
-#ifdef STARPU_QUICK_CHECK
-static int i_max = 5;
-#elif !defined(STARPU_LONG_CHECK)
-static int i_max = 100;
-#else
-static int i_max = 1000;
-#endif
-static double eps = (10e-14);
 
 
-static starpu_data_handle_t r_handle, d_handle, q_handle;
+
+static TYPE *A, *b, *x;
 static TYPE *r, *d, *q;
 static TYPE *r, *d, *q;
 
 
-static starpu_data_handle_t dtq_handle, rtr_handle;
-static TYPE dtq, rtr;
 
 
-extern struct starpu_codelet accumulate_variable_cl;
-extern struct starpu_codelet accumulate_vector_cl;
-extern struct starpu_codelet bzero_variable_cl;
-extern struct starpu_codelet bzero_vector_cl;
+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
+{
+	unsigned b;
+
+	for (b = 0; b < nblocks; b++)
+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	return 0;
+}
+
 
 
 /*
 /*
  *	Generate Input data
  *	Generate Input data
@@ -264,162 +261,48 @@ static void display_matrix(void)
 }
 }
 #endif
 #endif
 
 
-/*
- *	Main loop
- */
-
-static int cg(void)
+static void display_x_result(void)
 {
 {
-	double delta_new, delta_0;
-
-	int i = 0;
-	int ret;
+	int j, i;
+	starpu_data_handle_t sub;
 
 
-	/* r <- b */
-	ret = copy_handle(r_handle, b_handle, nblocks);
-	if (ret == -ENODEV) return ret;
+	FPRINTF(stderr, "Computed X vector:\n");
 
 
-	/* r <- r - A x */
-	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
-	if (ret == -ENODEV) return ret;
+	int block_size = n / nblocks;
 
 
-	/* d <- r */
-	ret = copy_handle(d_handle, r_handle, nblocks);
-	if (ret == -ENODEV) return ret;
-
-	/* delta_new = dot(r,r) */
-	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
-	if (ret == -ENODEV) return ret;
-
-	starpu_data_acquire(rtr_handle, STARPU_R);
-	delta_new = rtr;
-	delta_0 = delta_new;
-	starpu_data_release(rtr_handle);
-
-	FPRINTF(stderr, "*************** INITIAL ************ \n");
-	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
-
-	double start;
-	double end;
-	start = starpu_timing_now();
-
-	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
+	for (j = 0; j < nblocks; j++)
 	{
 	{
-		double delta_old;
-		double alpha, beta;
-
-		starpu_iteration_push(i);
-
-		/* q <- A d */
-		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
-
-		/* dtq <- dot(d,q) */
-		dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
-
-		/* alpha = delta_new / dtq */
-		starpu_data_acquire(dtq_handle, STARPU_R);
-		alpha = delta_new/dtq;
-		starpu_data_release(dtq_handle);
-
-		/* x <- x + alpha d */
-		axpy_kernel(x_handle, d_handle, alpha, nblocks);
-
-		if ((i % 50) == 0)
-		{
-			/* r <- b */
-			copy_handle(r_handle, b_handle, nblocks);
-
-			/* r <- r - A x */
-			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
-		}
-		else
-		{
-			/* r <- r - alpha q */
-			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
-		}
-
-		/* delta_new = dot(r,r) */
-		dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
-
-		starpu_data_acquire(rtr_handle, STARPU_R);
-		delta_old = delta_new;
-		delta_new = rtr;
-		beta = delta_new / delta_old;
-		starpu_data_release(rtr_handle);
-
-		/* d <- beta d + r */
-		scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
-
-		if ((i % 10) == 0)
+		sub = starpu_data_get_sub_data(x_handle, 1, j);
+		starpu_data_acquire(sub, STARPU_R);
+		for (i = 0; i < block_size; i++)
 		{
 		{
-			/* We here take the error as ||r||_2 / (n||b||_2) */
-			double error = sqrt(delta_new/delta_0)/(1.0*n);
-			FPRINTF(stderr, "*****************************************\n");
-			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+			FPRINTF(stderr, "% 02.2e\n", x[j*block_size + i]);
 		}
 		}
-
-		starpu_iteration_pop();
-		i++;
+		starpu_data_release(sub);
 	}
 	}
-
-	end = starpu_timing_now();
-
-	double timing = end - start;
-	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
-	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
-	return 0;
 }
 }
 
 
-static int check(void)
-{
-	return 0;
-}
 
 
 static void parse_args(int argc, char **argv)
 static void parse_args(int argc, char **argv)
 {
 {
 	int i;
 	int i;
 	for (i = 1; i < argc; i++)
 	for (i = 1; i < argc; i++)
 	{
 	{
-	        if (strcmp(argv[i], "-n") == 0)
-		{
-			n = (int long long)atoi(argv[++i]);
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-maxiter") == 0)
-		{
-			i_max = atoi(argv[++i]);
-			if (i_max <= 0)
-			{
-				FPRINTF(stderr, "the number of iterations must be positive, not %d\n", i_max);
-				exit(EXIT_FAILURE);
-			}
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-nblocks") == 0)
-		{
-			nblocks = atoi(argv[++i]);
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-no-reduction") == 0)
-		{
-			use_reduction = 0;
-			continue;
-		}
-
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
 		{
 		{
-			FPRINTF(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
+			FPRINTF_SERVER(stderr, "usage: %s [-h] [-nblocks #blocks] [-display-result] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
 			exit(-1);
 			exit(-1);
 		}
 		}
-        }
+	}
+
+	parse_common_args(argc, argv);
 }
 }
 
 
+
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int ret;
 	int ret;
+	double start, end;
 
 
 	/* Not supported yet */
 	/* Not supported yet */
 	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
 	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
@@ -434,9 +317,19 @@ int main(int argc, char **argv)
 
 
 	starpu_cublas_init();
 	starpu_cublas_init();
 
 
+	FPRINTF(stderr, "************** PARAMETERS ***************\n");
+	FPRINTF(stderr, "Problem size (-n): %lld\n", n);
+	FPRINTF(stderr, "Maximum number of iterations (-maxiter): %d\n", i_max);
+	FPRINTF(stderr, "Number of blocks (-nblocks): %d\n", nblocks);
+	FPRINTF(stderr, "Reduction (-no-reduction): %s\n", use_reduction ? "enabled" : "disabled");
+
+	start = starpu_timing_now();
 	generate_random_problem();
 	generate_random_problem();
 	register_data();
 	register_data();
 	partition_data();
 	partition_data();
+	end = starpu_timing_now();
+
+	FPRINTF(stderr, "Problem intialization timing : %2.2f seconds\n", (end-start)/10e6);
 
 
 	ret = cg();
 	ret = cg();
 	if (ret == -ENODEV)
 	if (ret == -ENODEV)
@@ -445,10 +338,13 @@ int main(int argc, char **argv)
 		goto enodev;
 		goto enodev;
 	}
 	}
 
 
-	ret = check();
-
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
+	if (display_result)
+	{
+		display_x_result();
+	}
+
 enodev:
 enodev:
 	unregister_data();
 	unregister_data();
 	free_data();
 	free_data();

+ 0 - 25
examples/cg/cg.h

@@ -54,29 +54,4 @@
 #define cublasscal	cublasSscal
 #define cublasscal	cublasSscal
 #endif
 #endif
 
 
-int dot_kernel(starpu_data_handle_t v1,
-	       starpu_data_handle_t v2,
-	       starpu_data_handle_t s,
-	       unsigned nblocks,
-	       int use_reduction);
-
-int gemv_kernel(starpu_data_handle_t v1,
-                starpu_data_handle_t matrix, 
-                starpu_data_handle_t v2,
-                TYPE p1, TYPE p2,
-		unsigned nblocks,
-		int use_reduction);
-
-int axpy_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t v2, TYPE p1,
-		unsigned nblocks);
-
-int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
-		     starpu_data_handle_t v2, TYPE p2,
-		     unsigned nblocks);
-
-int copy_handle(starpu_data_handle_t dst,
-		starpu_data_handle_t src,
-		unsigned nblocks);
-
 #endif /* __STARPU_EXAMPLE_CG_H__ */
 #endif /* __STARPU_EXAMPLE_CG_H__ */

+ 216 - 37
examples/cg/cg_kernels.c

@@ -23,11 +23,43 @@
 #include <limits.h>
 #include <limits.h>
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
+#include <cuda.h>
 #include <starpu_cublas_v2.h>
 #include <starpu_cublas_v2.h>
 static const TYPE gp1 = 1.0;
 static const TYPE gp1 = 1.0;
 static const TYPE gm1 = -1.0;
 static const TYPE gm1 = -1.0;
 #endif
 #endif
 
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+static int nblocks = 8;
+
+#ifdef STARPU_QUICK_CHECK
+static int i_max = 5;
+static int long long n = 2048;
+#elif !defined(STARPU_LONG_CHECK)
+static int long long n = 4096;
+static int i_max = 100;
+#else
+static int long long n = 4096;
+static int i_max = 1000;
+#endif
+static double eps = (10e-14);
+
+int use_reduction = 1;
+int display_result = 0;
+
+HANDLE_TYPE_MATRIX A_handle;
+HANDLE_TYPE_VECTOR b_handle;
+HANDLE_TYPE_VECTOR x_handle;
+
+HANDLE_TYPE_VECTOR r_handle;
+HANDLE_TYPE_VECTOR d_handle;
+HANDLE_TYPE_VECTOR q_handle;
+
+starpu_data_handle_t dtq_handle;
+starpu_data_handle_t rtr_handle;
+TYPE dtq, rtr;
+
 #if 0
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 {
 {
@@ -120,7 +152,7 @@ struct starpu_codelet accumulate_variable_cl =
 	.cuda_funcs = {accumulate_variable_cuda},
 	.cuda_funcs = {accumulate_variable_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.model = &accumulate_variable_model
 	.model = &accumulate_variable_model
 };
 };
@@ -164,7 +196,7 @@ struct starpu_codelet accumulate_vector_cl =
 	.cuda_funcs = {accumulate_vector_cuda},
 	.cuda_funcs = {accumulate_vector_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.model = &accumulate_vector_model
 	.model = &accumulate_vector_model
 };
 };
@@ -314,8 +346,8 @@ static struct starpu_codelet dot_kernel_cl =
 	.model = &dot_kernel_model
 	.model = &dot_kernel_model
 };
 };
 
 
-int dot_kernel(starpu_data_handle_t v1,
-	       starpu_data_handle_t v2,
+int dot_kernel(HANDLE_TYPE_VECTOR v1,
+	       HANDLE_TYPE_VECTOR v2,
 	       starpu_data_handle_t s,
 	       starpu_data_handle_t s,
 	       unsigned nblocks,
 	       unsigned nblocks,
 	       int use_reduction)
 	       int use_reduction)
@@ -327,21 +359,21 @@ int dot_kernel(starpu_data_handle_t v1,
 		starpu_data_invalidate_submit(s);
 		starpu_data_invalidate_submit(s);
 	else
 	else
 	{
 	{
-		ret = starpu_task_insert(&bzero_variable_cl, STARPU_W, s, 0);
+		ret = TASK_INSERT(&bzero_variable_cl, STARPU_W, s, 0);
 		if (ret == -ENODEV) return ret;
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	}
 
 
 	unsigned b;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	for (b = 0; b < nblocks; b++)
 	{
 	{
-		ret = starpu_task_insert(&dot_kernel_cl,
+		ret = TASK_INSERT(&dot_kernel_cl,
 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
-					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_R, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R, GET_VECTOR_BLOCK(v2, b),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 					 0);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	}
 	return 0;
 	return 0;
 }
 }
@@ -477,9 +509,9 @@ static struct starpu_codelet gemv_kernel_cl =
 	.model = &gemv_kernel_model
 	.model = &gemv_kernel_model
 };
 };
 
 
-int gemv_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t matrix,
-		starpu_data_handle_t v2,
+int gemv_kernel(HANDLE_TYPE_VECTOR v1,
+		HANDLE_TYPE_MATRIX matrix,
+		HANDLE_TYPE_VECTOR v2,
 		TYPE p1, TYPE p2,
 		TYPE p1, TYPE p2,
 		unsigned nblocks,
 		unsigned nblocks,
 		int use_reduction)
 		int use_reduction)
@@ -489,13 +521,13 @@ int gemv_kernel(starpu_data_handle_t v1,
 
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	for (b2 = 0; b2 < nblocks; b2++)
 	{
 	{
-		ret = starpu_task_insert(&scal_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
+		ret = TASK_INSERT(&scal_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b2),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b2,
 					 STARPU_TAG_ONLY, (starpu_tag_t) b2,
 					 0);
 					 0);
 		if (ret == -ENODEV) return ret;
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	}
 
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	for (b2 = 0; b2 < nblocks; b2++)
@@ -503,15 +535,15 @@ int gemv_kernel(starpu_data_handle_t v1,
 		for (b1 = 0; b1 < nblocks; b1++)
 		for (b1 = 0; b1 < nblocks; b1++)
 		{
 		{
 			TYPE one = 1.0;
 			TYPE one = 1.0;
-			ret = starpu_task_insert(&gemv_kernel_cl,
-						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
-						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
-						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
+			ret = TASK_INSERT(&gemv_kernel_cl,
+						 use_reduction?STARPU_REDUX:STARPU_RW,	GET_VECTOR_BLOCK(v1, b2),
+						 STARPU_R,	GET_MATRIX_BLOCK(matrix, b2, b1),
+						 STARPU_R,	GET_VECTOR_BLOCK(v2, b1),
 						 STARPU_VALUE,	&one,	sizeof(one),
 						 STARPU_VALUE,	&one,	sizeof(one),
 						 STARPU_VALUE,	&p2,	sizeof(p2),
 						 STARPU_VALUE,	&p2,	sizeof(p2),
 						 STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nblocks + b1,
 						 STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nblocks + b1,
 						 0);
 						 0);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+			STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 		}
 		}
 	}
 	}
 	return 0;
 	return 0;
@@ -582,23 +614,23 @@ static struct starpu_codelet scal_axpy_kernel_cl =
 	.model = &scal_axpy_kernel_model
 	.model = &scal_axpy_kernel_model
 };
 };
 
 
-int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
-		     starpu_data_handle_t v2, TYPE p2,
+int scal_axpy_kernel(HANDLE_TYPE_VECTOR v1, TYPE p1,
+		     HANDLE_TYPE_VECTOR v2, TYPE p2,
 		     unsigned nblocks)
 		     unsigned nblocks)
 {
 {
 	unsigned b;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	for (b = 0; b < nblocks; b++)
 	{
 	{
 		int ret;
 		int ret;
-		ret = starpu_task_insert(&scal_axpy_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+		ret = TASK_INSERT(&scal_axpy_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p2, sizeof(p2),
 					 STARPU_VALUE, &p2, sizeof(p2),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 					 0);
 		if (ret == -ENODEV) return ret;
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	}
 	return 0;
 	return 0;
 }
 }
@@ -661,30 +693,177 @@ static struct starpu_codelet axpy_kernel_cl =
 	.model = &axpy_kernel_model
 	.model = &axpy_kernel_model
 };
 };
 
 
-int axpy_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t v2, TYPE p1,
+int axpy_kernel(HANDLE_TYPE_VECTOR v1,
+		HANDLE_TYPE_VECTOR v2, TYPE p1,
 		unsigned nblocks)
 		unsigned nblocks)
 {
 {
 	unsigned b;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	for (b = 0; b < nblocks; b++)
 	{
 	{
 		int ret;
 		int ret;
-		ret = starpu_task_insert(&axpy_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+		ret = TASK_INSERT(&axpy_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 					 0);
 		if (ret == -ENODEV) return ret;
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	}
 	return 0;
 	return 0;
 }
 }
 
 
-int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
+
+/*
+ *	Main loop
+ */
+int cg(void)
 {
 {
-	unsigned b;
-	for (b = 0; b < nblocks; b++)
-		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	TYPE delta_new, delta_0, error, delta_old, alpha, beta;
+	double start, end, timing;
+	int i = 0, ret;
+
+	/* r <- b */
+	ret = copy_handle(r_handle, b_handle, nblocks);
+	if (ret == -ENODEV) return ret;
+
+	/* r <- r - A x */
+	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
+	if (ret == -ENODEV) return ret;
+
+	/* d <- r */
+	ret = copy_handle(d_handle, r_handle, nblocks);
+	if (ret == -ENODEV) return ret;
+
+	/* delta_new = dot(r,r) */
+	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
+	if (ret == -ENODEV) return ret;
+
+	GET_DATA_HANDLE(rtr_handle);
+	starpu_data_acquire(rtr_handle, STARPU_R);
+	delta_new = rtr;
+	delta_0 = delta_new;
+	starpu_data_release(rtr_handle);
+
+	FPRINTF_SERVER(stderr, "Delta limit: %e\n", (double) (eps*eps*delta_0));
+
+	FPRINTF_SERVER(stderr, "**************** INITIAL ****************\n");
+	FPRINTF_SERVER(stderr, "Delta 0: %e\n", delta_new);
+
+	BARRIER();
+	start = starpu_timing_now();
+
+	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
+	{
+		starpu_iteration_push(i);
+
+		/* q <- A d */
+		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
+
+		/* dtq <- dot(d,q) */
+		dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
+
+		/* alpha = delta_new / dtq */
+		GET_DATA_HANDLE(dtq_handle);
+		starpu_data_acquire(dtq_handle, STARPU_R);
+		alpha = delta_new / dtq;
+		starpu_data_release(dtq_handle);
+
+		/* x <- x + alpha d */
+		axpy_kernel(x_handle, d_handle, alpha, nblocks);
+
+		if ((i % 50) == 0)
+		{
+			/* r <- b */
+			copy_handle(r_handle, b_handle, nblocks);
+
+			/* r <- r - A x */
+			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
+		}
+		else
+		{
+			/* r <- r - alpha q */
+			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
+		}
+
+		/* delta_new = dot(r,r) */
+		dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
+
+		GET_DATA_HANDLE(rtr_handle);
+		starpu_data_acquire(rtr_handle, STARPU_R);
+		delta_old = delta_new;
+		delta_new = rtr;
+		beta = delta_new / delta_old;
+		starpu_data_release(rtr_handle);
+
+		/* d <- beta d + r */
+		scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
+
+		if ((i % 10) == 0)
+		{
+			/* We here take the error as ||r||_2 / (n||b||_2) */
+			error = sqrt(delta_new/delta_0)/(1.0*n);
+			FPRINTF_SERVER(stderr, "*****************************************\n");
+			FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+		}
+
+		starpu_iteration_pop();
+		i++;
+	}
+
+	BARRIER();
+	end = starpu_timing_now();
+	timing = end - start;
+
+	error = sqrt(delta_new/delta_0)/(1.0*n);
+	FPRINTF_SERVER(stderr, "*****************************************\n");
+	FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+	FPRINTF_SERVER(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
+	FPRINTF_SERVER(stderr, "Seconds per iteration : %2.2e seconds\n", timing/10e6/i);
+	FPRINTF_SERVER(stderr, "Number of iterations per second : %2.2e it/s\n", i/(timing/10e6));
+
 	return 0;
 	return 0;
 }
 }
+
+
+void parse_common_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-n") == 0)
+		{
+			n = (int long long)atoi(argv[++i]);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-display-result") == 0)
+		{
+			display_result = 1;
+			continue;
+		}
+
+		if (strcmp(argv[i], "-maxiter") == 0)
+		{
+			i_max = atoi(argv[++i]);
+			if (i_max <= 0)
+			{
+				FPRINTF_SERVER(stderr, "the number of iterations must be positive, not %d\n", i_max);
+				exit(EXIT_FAILURE);
+			}
+			continue;
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			nblocks = atoi(argv[++i]);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-no-reduction") == 0)
+		{
+			use_reduction = 0;
+			continue;
+		}
+	}
+}

+ 1 - 1
examples/pi/pi_redux.c

@@ -322,7 +322,7 @@ static struct starpu_codelet redux_codelet =
 	.cuda_funcs = {redux_cuda_func},
 	.cuda_funcs = {redux_cuda_func},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2
 	.nbuffers = 2
 };
 };
 
 

+ 1 - 1
examples/reductions/dot_product.c

@@ -211,7 +211,7 @@ static struct starpu_codelet redux_codelet =
 	.opencl_funcs = {redux_opencl_func},
 	.opencl_funcs = {redux_opencl_func},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.name = "redux"
 	.name = "redux"
 };
 };

+ 1 - 1
examples/reductions/minmax_reduction.c

@@ -95,7 +95,7 @@ static struct starpu_codelet minmax_redux_codelet =
 {
 {
 	.cpu_funcs = {minmax_redux_cpu_func},
 	.cpu_funcs = {minmax_redux_cpu_func},
 	.cpu_funcs_name = {"minmax_redux_cpu_func"},
 	.cpu_funcs_name = {"minmax_redux_cpu_func"},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.name = "redux"
 	.name = "redux"
 };
 };

+ 13 - 0
include/fstarpu_mod.f90

@@ -25,6 +25,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_RW
         type(c_ptr), bind(C) :: FSTARPU_RW
         type(c_ptr), bind(C) :: FSTARPU_SCRATCH
         type(c_ptr), bind(C) :: FSTARPU_SCRATCH
         type(c_ptr), bind(C) :: FSTARPU_REDUX
         type(c_ptr), bind(C) :: FSTARPU_REDUX
+        type(c_ptr), bind(C) :: FSTARPU_MPI_REDUX
         type(c_ptr), bind(C) :: FSTARPU_COMMUTE
         type(c_ptr), bind(C) :: FSTARPU_COMMUTE
         type(c_ptr), bind(C) :: FSTARPU_SSEND
         type(c_ptr), bind(C) :: FSTARPU_SSEND
         type(c_ptr), bind(C) :: FSTARPU_LOCALITY
         type(c_ptr), bind(C) :: FSTARPU_LOCALITY
@@ -36,11 +37,15 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_TASK_DEPS_ARRAY
         type(c_ptr), bind(C) :: FSTARPU_TASK_DEPS_ARRAY
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG
+        type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG
+        type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG
+        type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG
+        type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PRIORITY
         type(c_ptr), bind(C) :: FSTARPU_PRIORITY
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_NODE
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_NODE
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_DATA
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_DATA
@@ -2395,6 +2400,7 @@ module fstarpu_mod
                         FSTARPU_RW      = fstarpu_get_constant(C_CHAR_"FSTARPU_RW"//C_NULL_CHAR)
                         FSTARPU_RW      = fstarpu_get_constant(C_CHAR_"FSTARPU_RW"//C_NULL_CHAR)
                         FSTARPU_SCRATCH = fstarpu_get_constant(C_CHAR_"FSTARPU_SCRATCH"//C_NULL_CHAR)
                         FSTARPU_SCRATCH = fstarpu_get_constant(C_CHAR_"FSTARPU_SCRATCH"//C_NULL_CHAR)
                         FSTARPU_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_REDUX"//C_NULL_CHAR)
                         FSTARPU_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_REDUX"//C_NULL_CHAR)
+                        FSTARPU_MPI_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_MPI_REDUX"//C_NULL_CHAR)
                         FSTARPU_COMMUTE   = fstarpu_get_constant(C_CHAR_"FSTARPU_COMMUTE"//C_NULL_CHAR)
                         FSTARPU_COMMUTE   = fstarpu_get_constant(C_CHAR_"FSTARPU_COMMUTE"//C_NULL_CHAR)
                         FSTARPU_SSEND   = fstarpu_get_constant(C_CHAR_"FSTARPU_SSEND"//C_NULL_CHAR)
                         FSTARPU_SSEND   = fstarpu_get_constant(C_CHAR_"FSTARPU_SSEND"//C_NULL_CHAR)
                         FSTARPU_LOCALITY   = fstarpu_get_constant(C_CHAR_"FSTARPU_LOCALITY"//C_NULL_CHAR)
                         FSTARPU_LOCALITY   = fstarpu_get_constant(C_CHAR_"FSTARPU_LOCALITY"//C_NULL_CHAR)
@@ -2406,12 +2412,19 @@ module fstarpu_mod
                         FSTARPU_TASK_DEPS_ARRAY = fstarpu_get_constant(C_CHAR_"FSTARPU_TASK_DEPS_ARRAY"//C_NULL_CHAR)
                         FSTARPU_TASK_DEPS_ARRAY = fstarpu_get_constant(C_CHAR_"FSTARPU_TASK_DEPS_ARRAY"//C_NULL_CHAR)
                         FSTARPU_CALLBACK        = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK"//C_NULL_CHAR)
                         FSTARPU_CALLBACK        = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK"//C_NULL_CHAR)
                         FSTARPU_CALLBACK_WITH_ARG       = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG"//C_NULL_CHAR)
                         FSTARPU_CALLBACK_WITH_ARG       = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG"//C_NULL_CHAR)
+                        FSTARPU_CALLBACK_WITH_ARG_NFREE       = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_CALLBACK_ARG    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG"//C_NULL_CHAR)
                         FSTARPU_CALLBACK_ARG    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG"//C_NULL_CHAR)
+                        FSTARPU_CALLBACK_ARG_NFREE    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK       = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK       = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_ARG   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_ARG   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG"//C_NULL_CHAR)
+                        FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE   = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_POP   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_POP   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_POP_ARG       = &
                         FSTARPU_PROLOGUE_CALLBACK_POP_ARG       = &
                                 fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG"//C_NULL_CHAR)
                                 fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG"//C_NULL_CHAR)
+                        FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE       = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PRIORITY        = fstarpu_get_constant(C_CHAR_"FSTARPU_PRIORITY"//C_NULL_CHAR)
                         FSTARPU_PRIORITY        = fstarpu_get_constant(C_CHAR_"FSTARPU_PRIORITY"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_NODE = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_NODE"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_NODE = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_NODE"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_DATA = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_DATA"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_DATA = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_DATA"//C_NULL_CHAR)

+ 8 - 0
include/starpu.h

@@ -471,6 +471,14 @@ struct starpu_conf
 	   Maximum spinning backoff of drivers. Default value: \c 32
 	   Maximum spinning backoff of drivers. Default value: \c 32
 	 */
 	 */
 	unsigned driver_spinning_backoff_max;
 	unsigned driver_spinning_backoff_max;
+
+	/**
+	   Specify whether CUDA workers should perform only fast allocations
+	   when running the datawizard progress of
+	   other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
+	   The default value is 0, which allows CUDA workers to perform slow allocations.
+	 */
+	int cuda_only_fast_alloc_other_memnodes;
 };
 };
 
 
 /**
 /**

+ 14 - 4
include/starpu_data.h

@@ -110,7 +110,15 @@ enum starpu_data_access_mode
 				   src/sched_policies/work_stealing_policy.c
 				   src/sched_policies/work_stealing_policy.c
 				   source code.
 				   source code.
 				*/
 				*/
-	STARPU_ACCESS_MODE_MAX=(1<<7) /**< todo */
+	STARPU_MPI_REDUX=(1<<7), /**< Inter-node reduction only. Codelets
+				    contributing to these reductions should
+				    be registered with STARPU_RW | STARPU_COMMUTE
+				    access modes.
+				    When inserting these tasks through the
+				    MPI layer, however, the access mode needs
+				    to be STARPU_MPI_REDUX. */
+	STARPU_ACCESS_MODE_MAX=(1<<8) /**< The purpose of STARPU_ACCESS_MODE_MAX is to
+					be the maximum of this enum. */
 };
 };
 
 
 struct starpu_data_interface_ops;
 struct starpu_data_interface_ops;
@@ -305,7 +313,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_quick(starpu_data_hand
 
 
    This is a very internal interface, subject to changes, do not use this.
    This is a very internal interface, subject to changes, do not use this.
 */
 */
-int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback_acquired)(void *arg, int *node, enum starpu_data_access_mode mode), void (*callback)(void *arg), void *arg, int sequential_consistency, int quick, long *pre_sync_jobid, long *post_sync_jobid);
+int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback_acquired)(void *arg, int *node, enum starpu_data_access_mode mode), void (*callback)(void *arg), void *arg, int sequential_consistency, int quick, long *pre_sync_jobid, long *post_sync_jobid, int prio);
 
 
 /**
 /**
    The application can call this function instead of starpu_data_acquire() so as to
    The application can call this function instead of starpu_data_acquire() so as to
@@ -560,8 +568,10 @@ struct starpu_codelet;
 /**
 /**
    Set the codelets to be used for \p handle when it is accessed in the
    Set the codelets to be used for \p handle when it is accessed in the
    mode ::STARPU_REDUX. Per-worker buffers will be initialized with
    mode ::STARPU_REDUX. Per-worker buffers will be initialized with
-   the codelet \p init_cl, and reduction between per-worker buffers will be
-   done with the codelet \p redux_cl.
+   the codelet \p init_cl (which has to take one handle with STARPU_W), and
+   reduction between per-worker buffers will be done with the codelet \p
+   redux_cl (which has to take a first accumulation handle with
+   STARPU_RW|STARPU_COMMUTE, and a second contribution handle with STARPU_R).
 */
 */
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle, struct starpu_codelet *redux_cl, struct starpu_codelet *init_cl);
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle, struct starpu_codelet *redux_cl, struct starpu_codelet *init_cl);
 
 

+ 8 - 0
include/starpu_hash.h

@@ -39,6 +39,14 @@ extern "C"
 uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc);
 uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc);
 
 
 /**
 /**
+   Compute the CRC of a pointer value seeded by the \p inputcrc
+   <em>current state</em>. The return value should be considered as the new
+   <em>current state</em> for future CRC computation. This is used for computing
+   data size footprint.
+*/
+uint32_t starpu_hash_crc32c_be_ptr(void *input, uint32_t inputcrc);
+
+/**
    Compute the CRC of a 32bit number seeded by the \p inputcrc
    Compute the CRC of a 32bit number seeded by the \p inputcrc
    <em>current state</em>. The return value should be considered as the new
    <em>current state</em>. The return value should be considered as the new
    <em>current state</em> for future CRC computation. This is used for computing
    <em>current state</em> for future CRC computation. This is used for computing

+ 3 - 5
include/starpu_perfmodel.h

@@ -310,10 +310,10 @@ struct starpu_perfmodel
 void starpu_perfmodel_init(struct starpu_perfmodel *model);
 void starpu_perfmodel_init(struct starpu_perfmodel *model);
 
 
 /**
 /**
-   Deinitialize the \p model performance model structure. You need to call this 
-   before deallocating the structure. You will probably want to call 
+   Deinitialize the \p model performance model structure. You need to call this
+   before deallocating the structure. You will probably want to call
    starpu_perfmodel_unload_model() before calling this function, to save the perfmodel.
    starpu_perfmodel_unload_model() before calling this function, to save the perfmodel.
-*/   
+*/
 int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
 int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
 
 
 /**
 /**
@@ -322,7 +322,6 @@ int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
    - \p workerid is the worker on which calibration is to be performed (in the case of GPUs, use -1 for CPUs)
    - \p workerid is the worker on which calibration is to be performed (in the case of GPUs, use -1 for CPUs)
    - \p archi is the type of architecture on which calibration will be run
    - \p archi is the type of architecture on which calibration will be run
 */
 */
-
 int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
 int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
 
 
 /**
 /**
@@ -335,7 +334,6 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
    - \p workerid is the worker on which calibration was performed (in the case of GPUs, use -1 for CPUs)
    - \p workerid is the worker on which calibration was performed (in the case of GPUs, use -1 for CPUs)
    - \p archi is the type of architecture on which calibration was run
    - \p archi is the type of architecture on which calibration was run
 */
 */
-
 int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi);
 int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi);
 
 
 
 

+ 28 - 2
include/starpu_task.h

@@ -861,7 +861,28 @@ struct starpu_task
 	*/
 	*/
 	void *prologue_callback_arg;
 	void *prologue_callback_arg;
 
 
+	/** Optional field, the default value is <c>NULL</c>. This is a
+	   function pointer of prototype <c>void (*f)(void*)</c>
+	   which specifies a possible callback. If this pointer is
+	   non-<c>NULL</c>, the callback function is executed on the host
+	   when the task is popped from the scheduler, just before getting
+	   executed. The callback is passed the value contained in the
+	   starpu_task::prologue_callback_pop_arg field.
+	   No callback is executed if the field is set to <c>NULL</c>.
+
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_PROLOGUE_CALLBACK_POP followed by the function pointer.
+	*/
 	void (*prologue_callback_pop_func)(void *);
 	void (*prologue_callback_pop_func)(void *);
+	/**
+	   Optional field, the default value is <c>NULL</c>. This is
+	   the pointer passed to the prologue_callback_pop function. This
+	   field is ignored if the field
+	   starpu_task::prologue_callback_pop_func is set to <c>NULL</c>.
+
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_PROLOGUE_CALLBACK_POP_ARG followed by the argument.
+	   */
 	void *prologue_callback_pop_arg;
 	void *prologue_callback_pop_arg;
 
 
 	/**
 	/**
@@ -1424,8 +1445,13 @@ struct starpu_task
 	do {								\
 	do {								\
 		if ((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS || (task)->cl->nbuffers > STARPU_NMAXBUFS) \
 		if ((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS || (task)->cl->nbuffers > STARPU_NMAXBUFS) \
 			if ((task)->dyn_modes) (task)->dyn_modes[i] = mode; else (task)->modes[i] = mode; \
 			if ((task)->dyn_modes) (task)->dyn_modes[i] = mode; else (task)->modes[i] = mode; \
-		else							\
-			STARPU_CODELET_SET_MODE((task)->cl, mode, i);	\
+		else \
+		{							\
+			enum starpu_data_access_mode cl_mode = STARPU_CODELET_GET_MODE((task)->cl, i); \
+			STARPU_ASSERT_MSG(cl_mode == mode,	\
+				"Task <%s> can't set its  %d-th buffer mode to %d as the codelet it derives from uses %d", \
+				(task)->cl->name, i, mode, cl_mode);	\
+		} \
 	} while(0)
 	} while(0)
 
 
 /**
 /**

+ 4 - 0
include/starpu_util.h

@@ -257,6 +257,10 @@ extern "C"
 	_starpu_abort();				\
 	_starpu_abort();				\
 } while(0)
 } while(0)
 
 
+#if defined(_MSC_VER)
+  #undef STARPU_HAVE_STRERROR_R
+#endif
+
 #if defined(STARPU_HAVE_STRERROR_R)
 #if defined(STARPU_HAVE_STRERROR_R)
 #if (! defined(__GLIBC__) || !__GLIBC__) || ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && (! defined(_GNU_SOURCE)))
 #if (! defined(__GLIBC__) || !__GLIBC__) || ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && (! defined(_GNU_SOURCE)))
 /* XSI-compliant version of strerror_r returns an int */
 /* XSI-compliant version of strerror_r returns an int */

+ 2 - 2
julia/README

@@ -20,8 +20,8 @@ $ make
 Then, you need to add the lib/ directory to your library path and the julia/
 Then, you need to add the lib/ directory to your library path and the julia/
 directory to your Julia load path:
 directory to your Julia load path:
 
 
-$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/lib
-$ export JULIA_LOAD_PATH=$JULIA_LOAD_PATH:$PWD
+$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/src/.libs
+$ export JULIA_LOAD_PATH=$PWD/src:$JULIA_LOAD_PATH
 
 
 This step can also be done by sourcing the setenv.sh script:
 This step can also be done by sourcing the setenv.sh script:
 
 

+ 2 - 2
julia/examples/execute.sh.in

@@ -1,7 +1,7 @@
 #!@REALBASH@
 #!@REALBASH@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020-2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,7 +16,7 @@
 #
 #
 
 
 set -x
 set -x
-export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia:$JULIA_LOAD_PATH
+export JULIA_LOAD_PATH=@STARPU_SRC_DIR@/julia/src:$JULIA_LOAD_PATH
 export STARPU_BUILD_DIR=@STARPU_BUILD_DIR@
 export STARPU_BUILD_DIR=@STARPU_BUILD_DIR@
 export STARPU_SRC_DIR=@STARPU_SRC_DIR@
 export STARPU_SRC_DIR=@STARPU_SRC_DIR@
 export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3
 export STARPU_JULIA_LIB=@STARPU_BUILD_DIR@/julia/src/.libs/libstarpujulia-1.3

+ 2 - 2
julia/setenv.sh

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020-2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -13,7 +13,7 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 #
-export JULIA_LOAD_PATH=$JULIA_LOAD_PATH:$PWD
+export JULIA_LOAD_PATH=$PWD/src:$JULIA_LOAD_PATH
 
 
 if [ `uname` == "Darwin" ]; then
 if [ `uname` == "Darwin" ]; then
     export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$PWD/lib/
     export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$PWD/lib/

+ 1 - 1
julia/src/StarPU.jl

@@ -65,7 +65,7 @@ export STARPU_HISTORY_BASED, STARPU_REGRESSION_BASED
 export STARPU_NL_REGRESSION_BASED, STARPU_MULTIPLE_REGRESSION_BASED
 export STARPU_NL_REGRESSION_BASED, STARPU_MULTIPLE_REGRESSION_BASED
 export starpu_tag_t
 export starpu_tag_t
 export STARPU_NONE,STARPU_R,STARPU_W,STARPU_RW, STARPU_SCRATCH
 export STARPU_NONE,STARPU_R,STARPU_W,STARPU_RW, STARPU_SCRATCH
-export STARPU_REDUX,STARPU_COMMUTE, STARPU_SSEND, STARPU_LOCALITY
+export STARPU_MPI_REDUX, STARPU_REDUX,STARPU_COMMUTE, STARPU_SSEND, STARPU_LOCALITY
 export STARPU_ACCESS_MODE_MAX
 export STARPU_ACCESS_MODE_MAX
 
 
 # BLAS
 # BLAS

+ 89 - 2
mpi/examples/Makefile.am

@@ -272,9 +272,27 @@ starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky_distributed
 	matrix_decomposition/mpi_cholesky_distributed
 endif
 endif
 
 
-########################
+##############
+# CG example #
+##############
+
+if !STARPU_SIMGRID
+if !STARPU_NO_BLAS_LIB
+examplebin_PROGRAMS += cg/cg
+starpu_mpi_EXAMPLES += cg/cg
+
+cg_cg_SOURCES =					\
+	cg/cg.c						\
+	../../examples/common/blas.c
+
+cg_cg_LDADD =					\
+	$(STARPU_BLAS_LDFLAGS)
+endif
+endif
+
+###########################
 # MPI Matrix mult example #
 # MPI Matrix mult example #
-########################
+###########################
 
 
 examplebin_PROGRAMS +=		\
 examplebin_PROGRAMS +=		\
 	matrix_mult/mm
 	matrix_mult/mm
@@ -290,6 +308,24 @@ starpu_mpi_EXAMPLES +=				\
 	matrix_mult/mm
 	matrix_mult/mm
 endif
 endif
 
 
+########################
+# MPI STARPU_MPI_REDUX #
+########################
+
+examplebin_PROGRAMS +=		\
+	mpi_redux/mpi_redux
+
+mpi_redux_mpi_redux_SOURCES	=		\
+	mpi_redux/mpi_redux.c
+
+mpi_redux_mpi_redux_LDADD =			\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	mpi_redux/mpi_redux
+endif
+
 ##########################################
 ##########################################
 # Native Fortran MPI Matrix mult example #
 # Native Fortran MPI Matrix mult example #
 ##########################################
 ##########################################
@@ -336,6 +372,55 @@ endif
 endif
 endif
 endif
 endif
 
 
+########################################
+# Native Fortran MPI STARPU_REDUX test #
+########################################
+
+if STARPU_HAVE_MPIFORT
+if !STARPU_SANITIZE
+examplebin_PROGRAMS +=		\
+	native_fortran/nf_mpi_redux
+
+native_fortran_nf_mpi_redux_SOURCES	=			\
+	native_fortran/fstarpu_mpi_mod.f90	\
+	native_fortran/fstarpu_mod.f90		\
+	native_fortran/nf_mpi_redux.f90	
+
+native_fortran_nf_mpi_redux_LDADD =					\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	native_fortran/nf_mpi_redux
+endif
+endif
+endif
+
+########################################
+# Native Fortran MPI STARPU_REDUX test #
+########################################
+
+if STARPU_HAVE_MPIFORT
+if !STARPU_SANITIZE
+examplebin_PROGRAMS +=		\
+	native_fortran/nf_redux_test
+
+native_fortran_nf_redux_test_SOURCES	=			\
+	native_fortran/fstarpu_mpi_mod.f90	\
+	native_fortran/fstarpu_mod.f90		\
+	native_fortran/nf_redux_test.f90	
+
+native_fortran_nf_redux_test_LDADD =					\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	native_fortran/nf_redux_test
+endif
+endif
+endif
+
+
 ###################
 ###################
 # complex example #
 # complex example #
 ###################
 ###################
@@ -427,6 +512,8 @@ native_fortran/nf_mm_cl.o: fstarpu_mod.mod
 native_fortran/nf_mm.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_mm.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
+native_fortran/nf_redux_test.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
+native_fortran/nf_mpi_redux.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 endif
 endif
 endif
 endif
 
 

+ 422 - 0
mpi/examples/cg/cg.c

@@ -0,0 +1,422 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <assert.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <common/blas.h>
+
+/*
+ * Distributed version of Conjugate Gradient implemented in examples/cg/cg.c
+ *
+ * Use -display-result option and compare with the non-distributed version: the
+ * x vector should be the same.
+ */
+
+#include "../../../examples/cg/cg.h"
+
+static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, unsigned nblocks);
+
+#define HANDLE_TYPE_VECTOR starpu_data_handle_t*
+#define HANDLE_TYPE_MATRIX starpu_data_handle_t**
+#define TASK_INSERT(cl, ...) starpu_mpi_task_insert(MPI_COMM_WORLD, cl, ##__VA_ARGS__)
+#define GET_VECTOR_BLOCK(v, i) v[i]
+#define GET_MATRIX_BLOCK(m, i, j) m[i][j]
+#define BARRIER() starpu_mpi_barrier(MPI_COMM_WORLD);
+#define GET_DATA_HANDLE(handle) starpu_mpi_get_data_on_all_nodes_detached(MPI_COMM_WORLD, handle)
+
+static int block_size;
+
+static int rank;
+static int nodes_p = 2;
+static int nodes_q;
+
+static TYPE ***A;
+static TYPE **x;
+static TYPE **b;
+
+static TYPE **r;
+static TYPE **d;
+static TYPE **q;
+
+#define FPRINTF_SERVER(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT") && rank == 0) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+#include "../../../examples/cg/cg_kernels.c"
+
+/* Return the MPI rank associated with block (y, x) on the nodes_p x nodes_q node grid. */
+static int my_distrib(const int y, const int x)
+{
+	const int grid_row = y % nodes_q;
+	const int grid_col = x % nodes_p;
+
+	return grid_row * nodes_p + grid_col;
+}
+
+/*
+ * Copy every block of src into the matching block of dst. A copy is only
+ * submitted by the rank that my_distrib() returns for vector block (idx, 0).
+ * The copies are asynchronous and have no completion callback.
+ * Always returns 0.
+ */
+static int copy_handle(starpu_data_handle_t* dst, starpu_data_handle_t* src, unsigned nblocks)
+{
+	unsigned idx = 0;
+
+	while (idx < nblocks)
+	{
+		const int owner = my_distrib(idx, 0);
+
+		if (owner == rank)
+		{
+			starpu_data_cpy(dst[idx], src[idx], /* asynchronous */ 1, /* without callback */ NULL, NULL);
+		}
+
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ *	Generate input data.
+ *
+ *	Despite the function name the values are deterministic: A follows the
+ *	Hilbert formula below, b is filled with ones and x, r, d, q with zeros.
+ *	A rank only allocates the blocks that my_distrib() maps to it; when
+ *	display_result is set, every x block is also allocated (but left
+ *	uninitialised here) on the ranks it is not mapped to.
+ */
+static void generate_random_problem(void)
+{
+	unsigned nn, mm, m, n;
+	/* Signed, like rank and like the value returned by my_distrib(), so the
+	 * ownership tests below do not compare signed with unsigned. */
+	int mpi_rank;
+
+	/* Top-level arrays of block pointers, one entry per block */
+	A = malloc(nblocks * sizeof(TYPE **));
+	x = malloc(nblocks * sizeof(TYPE *));
+	b = malloc(nblocks * sizeof(TYPE *));
+
+	r = malloc(nblocks * sizeof(TYPE *));
+	d = malloc(nblocks * sizeof(TYPE *));
+	q = malloc(nblocks * sizeof(TYPE *));
+
+	for (m = 0; m < nblocks; m++)
+	{
+		A[m] = malloc(nblocks * sizeof(TYPE*));
+
+		mpi_rank = my_distrib(m, 0);
+
+		if (mpi_rank == rank || display_result)
+		{
+			starpu_malloc((void**) &x[m], block_size*sizeof(TYPE));
+		}
+
+		if (mpi_rank == rank)
+		{
+			starpu_malloc((void**) &b[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &r[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &d[m], block_size*sizeof(TYPE));
+			starpu_malloc((void**) &q[m], block_size*sizeof(TYPE));
+
+			for (mm = 0; mm < block_size; mm++)
+			{
+				x[m][mm] = (TYPE) 0.0;
+				b[m][mm] = (TYPE) 1.0;
+				r[m][mm] = (TYPE) 0.0;
+				d[m][mm] = (TYPE) 0.0;
+				q[m][mm] = (TYPE) 0.0;
+			}
+		}
+
+		for (n = 0; n < nblocks; n++)
+		{
+			mpi_rank = my_distrib(m, n);
+			if (mpi_rank == rank)
+			{
+				starpu_malloc((void**) &A[m][n], block_size*block_size*sizeof(TYPE));
+
+				for (nn = 0; nn < block_size; nn++)
+				{
+					for (mm = 0; mm < block_size; mm++)
+					{
+						/* We take the Hilbert matrix, which is ill-conditioned but positive definite: H(i,j) = 1/(1+i+j) */
+						A[m][n][mm + nn*block_size] = (TYPE) (1.0/(1.0+(nn+(m*block_size)+mm+(n*block_size))));
+					}
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Release everything allocated by generate_random_problem(): the blocks this
+ * rank allocated with starpu_malloc() (same ownership tests as at allocation
+ * time), then the arrays of block pointers.
+ */
+static void free_data(void)
+{
+	unsigned m, n;
+	/* Signed, like rank and like the value returned by my_distrib() */
+	int mpi_rank;
+
+	for (m = 0; m < nblocks; m++)
+	{
+		mpi_rank = my_distrib(m, 0);
+
+		/* x blocks exist on every rank when the result is displayed */
+		if (mpi_rank == rank || display_result)
+		{
+			starpu_free((void*) x[m]);
+		}
+
+		if (mpi_rank == rank)
+		{
+			starpu_free((void*) b[m]);
+			starpu_free((void*) r[m]);
+			starpu_free((void*) d[m]);
+			starpu_free((void*) q[m]);
+		}
+
+		for (n = 0; n < nblocks; n++)
+		{
+			mpi_rank = my_distrib(m, n);
+			if (mpi_rank == rank)
+			{
+				starpu_free((void*) A[m][n]);
+			}
+		}
+
+		free(A[m]);
+	}
+
+	free(A);
+	free(x);
+	free(b);
+	free(r);
+	free(d);
+	free(q);
+}
+
+/*
+ * Register every block of A, x, b, r, d and q, and the dtq and rtr scalars,
+ * with StarPU and StarPU-MPI. A block is registered on STARPU_MAIN_RAM with
+ * its local buffer on the rank my_distrib() maps it to, and with home node -1
+ * and no buffer elsewhere; x blocks are also backed by a local buffer on
+ * every rank when display_result is set. Each handle gets the next MPI tag.
+ * When use_reduction is set, reduction codelets are attached to the q and r
+ * blocks and to dtq and rtr.
+ */
+static void register_data(void)
+{
+	unsigned row, col;
+	starpu_mpi_tag_t mpi_tag = 0;
+
+	A_handle = malloc(nblocks*sizeof(starpu_data_handle_t*));
+	x_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	b_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	r_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	d_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+	q_handle = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+	for (row = 0; row < nblocks; row++)
+	{
+		const int vec_owner = my_distrib(row, 0);
+		const int owned = (vec_owner == rank);
+		/* x has a local buffer on the owner, and everywhere when the result is displayed */
+		const int x_backed = owned || display_result;
+		const int vec_home = owned ? STARPU_MAIN_RAM : -1;
+
+		A_handle[row] = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+		starpu_vector_data_register(&x_handle[row], x_backed ? STARPU_MAIN_RAM : -1, x_backed ? (uintptr_t) x[row] : (uintptr_t) NULL, block_size, sizeof(TYPE));
+
+		starpu_vector_data_register(&b_handle[row], vec_home, owned ? (uintptr_t) b[row] : (uintptr_t) NULL, block_size, sizeof(TYPE));
+		starpu_vector_data_register(&r_handle[row], vec_home, owned ? (uintptr_t) r[row] : (uintptr_t) NULL, block_size, sizeof(TYPE));
+		starpu_vector_data_register(&d_handle[row], vec_home, owned ? (uintptr_t) d[row] : (uintptr_t) NULL, block_size, sizeof(TYPE));
+		starpu_vector_data_register(&q_handle[row], vec_home, owned ? (uintptr_t) q[row] : (uintptr_t) NULL, block_size, sizeof(TYPE));
+
+		starpu_data_set_coordinates(x_handle[row], 1, row);
+		starpu_mpi_data_register(x_handle[row], ++mpi_tag, vec_owner);
+		starpu_data_set_coordinates(b_handle[row], 1, row);
+		starpu_mpi_data_register(b_handle[row], ++mpi_tag, vec_owner);
+		starpu_data_set_coordinates(r_handle[row], 1, row);
+		starpu_mpi_data_register(r_handle[row], ++mpi_tag, vec_owner);
+		starpu_data_set_coordinates(d_handle[row], 1, row);
+		starpu_mpi_data_register(d_handle[row], ++mpi_tag, vec_owner);
+		starpu_data_set_coordinates(q_handle[row], 1, row);
+		starpu_mpi_data_register(q_handle[row], ++mpi_tag, vec_owner);
+
+		if (use_reduction)
+		{
+			starpu_data_set_reduction_methods(q_handle[row], &accumulate_vector_cl, &bzero_vector_cl);
+			starpu_data_set_reduction_methods(r_handle[row], &accumulate_vector_cl, &bzero_vector_cl);
+		}
+
+		for (col = 0; col < nblocks; col++)
+		{
+			const int blk_owner = my_distrib(row, col);
+			const int blk_owned = (blk_owner == rank);
+
+			starpu_matrix_data_register(&A_handle[row][col], blk_owned ? STARPU_MAIN_RAM : -1, blk_owned ? (uintptr_t) A[row][col] : (uintptr_t) NULL, block_size, block_size, block_size, sizeof(TYPE));
+
+			starpu_data_set_coordinates(A_handle[row][col], 2, col, row);
+			starpu_mpi_data_register(A_handle[row][col], ++mpi_tag, blk_owner);
+		}
+	}
+
+	/* The two scalars are registered locally on every rank and attached to rank 0 */
+	starpu_variable_data_register(&dtq_handle, STARPU_MAIN_RAM, (uintptr_t)&dtq, sizeof(TYPE));
+	starpu_variable_data_register(&rtr_handle, STARPU_MAIN_RAM, (uintptr_t)&rtr, sizeof(TYPE));
+	starpu_mpi_data_register(rtr_handle, ++mpi_tag, 0);
+	starpu_mpi_data_register(dtq_handle, ++mpi_tag, 0);
+
+	if (use_reduction)
+	{
+		starpu_data_set_reduction_methods(dtq_handle, &accumulate_variable_cl, &bzero_variable_cl);
+		starpu_data_set_reduction_methods(rtr_handle, &accumulate_variable_cl, &bzero_variable_cl);
+	}
+}
+
+/*
+ * Unregister every handle created by register_data() and free the arrays
+ * that hold the handles.
+ */
+static void unregister_data(void)
+{
+	unsigned row;
+
+	for (row = 0; row < nblocks; row++)
+	{
+		unsigned col = 0;
+
+		starpu_data_unregister(x_handle[row]);
+		starpu_data_unregister(b_handle[row]);
+		starpu_data_unregister(r_handle[row]);
+		starpu_data_unregister(d_handle[row]);
+		starpu_data_unregister(q_handle[row]);
+
+		while (col < nblocks)
+		{
+			starpu_data_unregister(A_handle[row][col]);
+			col++;
+		}
+
+		free(A_handle[row]);
+	}
+
+	starpu_data_unregister(dtq_handle);
+	starpu_data_unregister(rtr_handle);
+
+	free(A_handle);
+	free(x_handle);
+	free(b_handle);
+	free(r_handle);
+	free(d_handle);
+	free(q_handle);
+}
+
+/*
+ * Request every block of x on node 0, then have rank 0 acquire each block in
+ * read mode and print its elements on stderr.
+ */
+static void display_x_result(void)
+{
+	int blk, elt;
+
+	for (blk = 0; blk < nblocks; blk++)
+	{
+		starpu_mpi_get_data_on_node(MPI_COMM_WORLD, x_handle[blk], 0);
+	}
+
+	/* Only rank 0 prints */
+	if (rank != 0)
+		return;
+
+	FPRINTF_SERVER(stderr, "Computed X vector:\n");
+	for (blk = 0; blk < nblocks; blk++)
+	{
+		starpu_data_acquire(x_handle[blk], STARPU_R);
+		for (elt = 0; elt < block_size; elt++)
+		{
+			FPRINTF(stderr, "% 02.2e\n", x[blk][elt]);
+		}
+		starpu_data_release(x_handle[blk]);
+	}
+}
+
+/*
+ * Parse the options specific to the MPI version (-p and the help flags), then
+ * hand the whole command line to parse_common_args().
+ * Exits with status -1 after printing the usage, or when -p is malformed.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-p") == 0)
+		{
+			/* -p needs a value: without this check argv[argc] (NULL) would be given to atoi() */
+			if (i + 1 >= argc)
+			{
+				FPRINTF_SERVER(stderr, "Option -p requires a node grid width.\n");
+				exit(-1);
+			}
+
+			nodes_p = atoi(argv[++i]);
+
+			/* main() computes worldsize % nodes_p, so zero must be rejected; a negative width is meaningless */
+			if (nodes_p <= 0)
+			{
+				FPRINTF_SERVER(stderr, "Node grid width (-p) must be a positive integer.\n");
+				exit(-1);
+			}
+			continue;
+		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
+		{
+			FPRINTF_SERVER(stderr, "usage: %s [-h] [-nblocks #blocks] [-display-result] [-p node_grid_width] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
+			exit(-1);
+		}
+	}
+
+	parse_common_args(argc, argv);
+}
+
+/*
+ * Entry point of the distributed CG example: initialise StarPU-MPI, check
+ * that the node grid and the block count fit the run, build and register the
+ * problem, run cg(), optionally display x, then release everything.
+ * Returns 77 when the run has to be skipped, 1 on invalid parameters and the
+ * value of cg() otherwise.
+ */
+int main(int argc, char **argv)
+{
+	int worldsize, ret;
+	double start, end;
+
+	/* Not supported yet */
+	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
+		return 77;
+
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	parse_args(argc, argv);
+
+	if (worldsize % nodes_p != 0)
+	{
+		FPRINTF_SERVER(stderr, "Node grid (%d) width must divide the number of nodes (%d).\n", nodes_p, worldsize);
+		starpu_mpi_shutdown();
+		return 1;
+	}
+	nodes_q = worldsize / nodes_p;
+
+	if (n % nblocks != 0)
+	{
+		FPRINTF_SERVER(stderr, "The number of blocks (%d) must divide the matrix size (%lld).\n", nblocks, n);
+		starpu_mpi_shutdown();
+		return 1;
+	}
+	block_size = n / nblocks;
+
+	starpu_cublas_init();
+
+	FPRINTF_SERVER(stderr, "************** PARAMETERS ***************\n");
+	FPRINTF_SERVER(stderr, "%d nodes (%dx%d)\n", worldsize, nodes_p, nodes_q);
+	FPRINTF_SERVER(stderr, "Problem size (-n): %lld\n", n);
+	FPRINTF_SERVER(stderr, "Maximum number of iterations (-maxiter): %d\n", i_max);
+	FPRINTF_SERVER(stderr, "Number of blocks (-nblocks): %d\n", nblocks);
+	FPRINTF_SERVER(stderr, "Reduction (-no-reduction): %s\n", use_reduction ? "enabled" : "disabled");
+
+	/* Time the problem generation and registration between two barriers */
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	start = starpu_timing_now();
+	generate_random_problem();
+	register_data();
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	end = starpu_timing_now();
+
+	/* starpu_timing_now() is in microseconds: divide by 1e6 (10e6 would be 1e7) to get seconds */
+	FPRINTF_SERVER(stderr, "Problem initialization timing : %2.2f seconds\n", (end-start)/1e6);
+
+	ret = cg();
+	if (ret == -ENODEV)
+	{
+		ret = 77;
+		goto enodev;
+	}
+
+	starpu_task_wait_for_all();
+
+	if (display_result)
+	{
+		display_x_result();
+	}
+
+enodev:
+	unregister_data();
+	free_data();
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	return ret;
+}

+ 201 - 0
mpi/examples/mpi_redux/mpi_redux.c

@@ -0,0 +1,201 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example illustrates how to use the STARPU_MPI_REDUX mode
+ * and compare it with the standard STARPU_REDUX.
+ *
+ * In order to make this comparison salient, the init codelet is not
+ * a task that sets the handle to a neutral element but rather depends
+ * on the working node.
+ * This is not a proper way to use a reduction pattern however it
+ * can be analogous to the cost/weight of each contribution.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+#include <unistd.h>
+
+/*
+ * CPU kernel of the work tasks: after a 2 second sleep, adds 3.0 and the
+ * value of the second buffer to the first buffer, printing the first buffer
+ * before and after the update.
+ */
+static void cl_cpu_work(void *handles[], void*arg)
+{
+	double *acc = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	double *contrib = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
+
+	(void)arg;
+
+	sleep(2);
+	printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *acc);
+	*acc = 3.0 + *acc + *contrib;
+	printf("%f\n",*acc);
+}
+
+/*
+ * Work codelet whose first buffer is declared in STARPU_REDUX mode.
+ * Named "task_work" rather than "task_init" so that it is not confused with
+ * task_init_cl, which already carries that name.
+ */
+static struct starpu_codelet work_cl =
+{
+	.cpu_funcs = { cl_cpu_work },
+	.nbuffers = 2,
+	.modes = { STARPU_REDUX, STARPU_R },
+	.name = "task_work"
+};
+
+/*
+ * Same kernel, with the first buffer declared STARPU_RW | STARPU_COMMUTE;
+ * main() submits it with STARPU_MPI_REDUX as the task access mode.
+ */
+static struct starpu_codelet mpi_work_cl =
+{
+	.cpu_funcs = { cl_cpu_work },
+	.nbuffers = 2,
+	.modes = { STARPU_RW | STARPU_COMMUTE, STARPU_R },
+	.name = "task_work-mpi"
+};
+
+/*
+ * CPU kernel of the reduction init codelet: after a 1 second sleep, prints the
+ * previous content of the buffer and overwrites it with the MPI world rank.
+ */
+static void cl_cpu_task_init(void *handles[], void*arg)
+{
+	double *value = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	int me;
+
+	(void) arg;
+
+	sleep(1);
+	me = starpu_mpi_world_rank();
+	printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", me, starpu_worker_get_id(), me, *value);
+	*value = me;
+}
+
+/* Init codelet passed to starpu_data_set_reduction_methods(): a single write-only buffer. */
+static struct starpu_codelet task_init_cl =
+{
+	.name = "task_init",
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+	.cpu_funcs = { cl_cpu_task_init }
+};
+
+/*
+ * CPU kernel of the reduction codelet: after a 2 second sleep, prints both
+ * operands and their sum, then adds the second buffer into the first one.
+ */
+static void cl_cpu_task_red(void *handles[], void*arg)
+{
+	double *dst = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	double *src = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
+
+	(void) arg;
+
+	sleep(2);
+	printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *src, *dst, *src+*dst);
+	*dst = *dst + *src;
+}
+
+/*
+ * Reduction codelet passed to starpu_data_set_reduction_methods(): the first
+ * buffer is the accumulator, the second one the contribution added to it.
+ */
+static struct starpu_codelet task_red_cl =
+{
+	.cpu_funcs = { cl_cpu_task_red },
+	.nbuffers = 2,
+	/* starpu_data_set_reduction_methods() documents that the accumulation handle has to be STARPU_RW|STARPU_COMMUTE */
+	.modes = { STARPU_RW | STARPU_COMMUTE, STARPU_R },
+	.name = "task_red"
+};
+
+/*
+ * Run the same accumulation twice, first with STARPU_MPI_REDUX and then with
+ * STARPU_REDUX as the access mode of a_h, and print the computed value next to
+ * the expected one on rank 0. The test is skipped when fewer than 2 CPU
+ * workers or fewer than 2 MPI nodes are available.
+ */
+int main(int argc, char *argv[])
+{
+	int comm_rank, comm_size;
+	/* Initializes StarPU and the StarPU-MPI layer */
+	starpu_fxt_autostart_profiling(0);
+	int ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	int nworkers = starpu_cpu_worker_get_count();
+	if (nworkers < 2)
+	{
+		FPRINTF(stderr, "We need at least 2 CPU workers per node.\n");
+		starpu_mpi_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size);
+	if (comm_size < 2)
+	{
+		FPRINTF(stderr, "We need at least 2 nodes.\n");
+		starpu_mpi_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
+
+	double a, b[comm_size];
+	starpu_data_handle_t a_h, b_h[comm_size];
+	double work_coef = 2;
+	enum starpu_data_access_mode task_mode;
+	int i,j,work_node;
+	starpu_mpi_tag_t tag = 0;
+	/* Iteration 0 uses STARPU_MPI_REDUX, iteration 1 uses STARPU_REDUX */
+	for (i = 0 ; i < 2 ; i++)
+	{
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		if (i==0)
+			task_mode = STARPU_MPI_REDUX;
+		else
+			task_mode = STARPU_REDUX;
+		if (comm_rank == 0)
+		{
+			/* Rank 0 holds a; every b_j is registered without a local buffer */
+			a = 1.0;
+			printf("init a = %f\n", a);
+			starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double));
+			for (j=0;j<comm_size;j++)
+				starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
+		}
+		else
+		{
+			/* The other ranks hold their own b_j only */
+			b[comm_rank] = 1.0 / (comm_rank + 1.0);
+			printf("init b_%d = %f\n", comm_rank, b[comm_rank]);
+			starpu_variable_data_register(&a_h, -1, 0, sizeof(double));
+			for (j=0;j<comm_size;j++)
+			{
+				if (j == comm_rank)
+					starpu_variable_data_register(&b_h[j], STARPU_MAIN_RAM, (uintptr_t)&b[j], sizeof(double));
+				else
+					starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
+			}
+		}
+		starpu_mpi_data_register(a_h, tag++, 0);
+		for (j=0;j<comm_size;j++)
+			starpu_mpi_data_register(b_h[j], tag++, j);
+
+		starpu_data_set_reduction_methods(a_h, &task_red_cl, &task_init_cl);
+		starpu_fxt_start_profiling();
+		/* Submit work_coef*nworkers tasks on each node other than 0 */
+		for (work_node=1; work_node < comm_size;work_node++)
+		{
+			for (j=1;j<=work_coef*nworkers;j++)
+			{
+				if (i == 0)
+					starpu_mpi_task_insert(MPI_COMM_WORLD,
+						&mpi_work_cl,
+						task_mode, a_h,
+						STARPU_R, b_h[work_node],
+						STARPU_EXECUTE_ON_NODE, work_node,
+						0);
+				else
+					starpu_mpi_task_insert(MPI_COMM_WORLD,
+						&work_cl,
+						task_mode, a_h,
+						STARPU_R, b_h[work_node],
+						STARPU_EXECUTE_ON_NODE, work_node,
+						0);
+			}
+		}
+		starpu_mpi_redux_data(MPI_COMM_WORLD, a_h);
+		starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		if (comm_rank == 0)
+		{
+			double tmp = 0.0;
+			for (work_node = 1; work_node < comm_size ; work_node++)
+				tmp += 1.0 / (work_node + 1.0);
+			printf("computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp));
+		}
+		starpu_data_unregister(a_h);
+		for (work_node=0; work_node < comm_size;work_node++)
+			starpu_data_unregister(b_h[work_node]);
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+	}
+	starpu_mpi_shutdown();
+	return 0;
+}

+ 253 - 0
mpi/examples/native_fortran/nf_mpi_redux.f90

@@ -0,0 +1,253 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+!
+program nf_mpi_redux
+  ! Distributed reduction example: rank 0 owns the scalar a, every other rank r
+  ! owns b(r). Each non-root rank is asked to run work_coef*nworkers tasks that
+  ! add 3 + b(r) to a; the contributions are then reduced into a on rank 0.
+  ! The experiment is run twice: first with FSTARPU_MPI_REDUX as task access
+  ! mode (codelet declared RW|COMMUTE), then with FSTARPU_REDUX.
+  use iso_c_binding
+  use fstarpu_mod
+  use fstarpu_mpi_mod
+
+  implicit none
+
+  integer, target                         :: ret, i, trial
+  type(c_ptr)                             :: work_cl, task_red_cl, task_ini_cl
+  character(kind=c_char,len=*), parameter :: name=C_CHAR_"task"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: namered=C_CHAR_"task_red"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: nameini=C_CHAR_"task_ini"//C_NULL_CHAR
+  real(kind(1.d0)), target                :: a,tmp
+  real(kind(1.d0)), target, allocatable   :: b(:)
+  integer(kind=8)                         :: tag, err
+  type(c_ptr)                             :: ahdl
+  type(c_ptr), target, allocatable        :: bhdl(:)
+  type(c_ptr)                             :: task_mode, codelet_mode
+  integer, target                         :: comm_world,comm_w_rank, comm_size
+  integer(c_int), target                  :: w_node, nworkers, work_coef
+
+  ! FxT tracing is started explicitly with fstarpu_fxt_start_profiling below
+  call fstarpu_fxt_autostart_profiling(0)
+  ret = fstarpu_init(c_null_ptr)
+  ret = fstarpu_mpi_init(1)
+
+  comm_world = fstarpu_mpi_world_comm()
+  comm_w_rank  = fstarpu_mpi_world_rank()
+  comm_size  = fstarpu_mpi_world_size()
+  if (comm_size.lt.2) then
+    write(*,'(" ")')
+    write(*,'("This application is meant to run with at least two nodes.")')
+    stop 2
+  end if
+  ! one value and one handle per non-root rank, indexed 1..comm_size-1
+  allocate(b(comm_size-1), bhdl(comm_size-1))
+  nworkers = fstarpu_worker_get_count()
+  if (nworkers.lt.1) then
+    write(*,'(" ")')
+    write(*,'("This application is meant to run with at least one worker per node.")')
+    stop 2
+  end if
+
+  ! allocate the reduction and initialization codelets (shared by both trials)
+  task_red_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_red_cl, namered)
+  call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW)
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
+
+  task_ini_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_ini_cl, nameini)
+  call fstarpu_codelet_add_cpu_func(task_ini_cl,C_FUNLOC(cl_cpu_task_ini))
+  call fstarpu_codelet_add_buffer(task_ini_cl, FSTARPU_W)
+
+  ! number of tasks submitted per worker of each non-root rank
+  work_coef=2
+
+  do trial=1,2
+
+  if (trial.eq.1) then
+        write(*,*) "Using STARPU_MPI_REDUX"
+        codelet_mode = FSTARPU_RW.ior.FSTARPU_COMMUTE
+        task_mode = FSTARPU_MPI_REDUX
+  else if (trial.eq.2) then
+        write(*,*) "Using STARPU_REDUX"
+        codelet_mode = FSTARPU_REDUX
+        task_mode = FSTARPU_REDUX
+  end if
+  ! allocate and fill the work codelet; its first buffer mode depends on the trial
+  work_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(work_cl, name)
+  call fstarpu_codelet_add_cpu_func(work_cl, C_FUNLOC(cl_cpu_task))
+  call fstarpu_codelet_add_buffer(work_cl, codelet_mode)
+  call fstarpu_codelet_add_buffer(work_cl, FSTARPU_R)
+  err = fstarpu_mpi_barrier(comm_world)
+
+  if(comm_w_rank.eq.0) then
+    write(*,'(" ")')
+    a = 1.0
+    write(*,*) "init a = ", a
+  else
+    b(comm_w_rank) = 1.0 / (comm_w_rank + 1.0)
+    ! type(c_ptr) has private components and cannot portably appear in an
+    ! output list; print the address as an integer instead (debug aid only)
+    write(*,*) "init b_",comm_w_rank,"=", b(comm_w_rank), " AT ", &
+      transfer(c_loc(bhdl(comm_w_rank)), 0_c_intptr_t)
+  end if
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+  ! register a on rank 0 and b(i) on rank i; every other rank registers a
+  ! placeholder (home node -1, null pointer) for the data it does not own
+  tag = 0
+  if(comm_w_rank.eq.0) then
+    call fstarpu_variable_data_register(ahdl, 0, c_loc(a),c_sizeof(a))
+    do i=1,comm_size-1
+        call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
+    end do
+  else
+    call fstarpu_variable_data_register(ahdl, -1, c_null_ptr,c_sizeof(a))
+    do i=1,comm_size-1
+      if (i.eq.comm_w_rank) then
+        call fstarpu_variable_data_register(bhdl(i), 0, c_loc(b(i)),c_sizeof(b(i)))
+      else
+        call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
+      end if
+    end do
+  end if
+  call fstarpu_mpi_data_register(ahdl,  tag,  0)
+  do i=1,comm_size-1
+     call fstarpu_mpi_data_register(bhdl(i), tag+i,i)
+  end do
+
+  tag = tag + comm_size
+
+  call fstarpu_data_set_reduction_methods(ahdl,task_red_cl,task_ini_cl)
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+
+  call fstarpu_fxt_start_profiling()
+  ! submit work_coef*nworkers tasks to be executed on each non-root rank
+  do w_node=1,comm_size-1
+    do i=1,work_coef*nworkers
+      call fstarpu_mpi_task_insert( (/ c_loc(comm_world),   &
+             work_cl,                                         &
+             task_mode, ahdl,                            &
+             FSTARPU_R, bhdl(w_node),                      &
+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+             C_NULL_PTR /))
+    end do
+  end do
+  call fstarpu_mpi_redux_data(comm_world, ahdl)
+  err = fstarpu_mpi_wait_for_all(comm_world)
+
+  if(comm_w_rank.eq.0) then
+    ! tmp = sum of the b values over all non-root ranks
+    tmp = 0
+    do w_node=1,comm_size-1
+      tmp = tmp + 1.0 / (w_node+1.0)
+    end do
+    write(*,*) 'computed result ---> ',a, "expected =",&
+      1.0 + (comm_size-1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1.0)*3.0 + tmp)
+  end if
+  err = fstarpu_mpi_barrier(comm_world)
+  call fstarpu_data_unregister(ahdl)
+  do w_node=1,comm_size-1
+    call fstarpu_data_unregister(bhdl(w_node))
+  end do
+  call fstarpu_codelet_free(work_cl)
+
+  end do
+
+  call fstarpu_fxt_stop_profiling()
+  call fstarpu_codelet_free(task_red_cl)
+  call fstarpu_codelet_free(task_ini_cl)
+
+
+  err = fstarpu_mpi_shutdown()
+  call fstarpu_shutdown()
+  deallocate(b, bhdl)
+  stop
+
+contains
+
+  recursive subroutine cl_cpu_task (buffers, cl_args) bind(C)
+    ! Work kernel: after a one-second busy wait, adds 3 plus the value of
+    ! buffer 1 to the value held in buffer 0 and logs the update.
+    !   buffers : StarPU buffer descriptors (0 = accumulator, 1 = read-only input)
+    !   cl_args : codelet arguments, not used here
+    use iso_c_binding
+    use fstarpu_mod
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args
+    real(kind(1.d0)), pointer :: acc, operand
+    real(kind(1.d0))          :: before
+    integer(c_int)            :: wid
+    integer                   :: rank
+
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), acc)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), operand)
+    rank = fstarpu_mpi_world_rank()
+    wid  = fstarpu_worker_get_id()
+
+    call nf_sleep(1.d0)
+    before = acc
+    acc = before + 3.0 + operand
+    write(*,*) "task   (c_w_rank:",rank," worker_id:",wid,") from ",before,"to",acc
+  end subroutine cl_cpu_task
+
+  recursive subroutine cl_cpu_task_red (buffers, cl_args) bind(C)
+    ! Reduction kernel: adds the value of buffer 1 into buffer 0, then busy
+    ! waits one second and logs source, previous and new destination values.
+    !   buffers : StarPU buffer descriptors (0 = destination, 1 = source)
+    !   cl_args : codelet arguments, not used here
+    use iso_c_binding
+    use fstarpu_mod
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args
+    real(kind(1.d0)), pointer :: dst, src
+    real(kind(1.d0))          :: previous
+    integer(c_int)            :: wid
+    integer                   :: rank
+
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), dst)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), src)
+    rank = fstarpu_mpi_world_rank()
+    wid  = fstarpu_worker_get_id()
+
+    previous = dst
+    dst = previous + src
+    call nf_sleep(1.d0)
+    write(*,*) "red_cl (c_w_rank:",rank,"worker_id:",wid,")",src, previous, ' ---> ',dst
+  end subroutine cl_cpu_task_red
+
+  recursive subroutine cl_cpu_task_ini (buffers, cl_args) bind(C)
+    ! Initialization kernel: after a half-second busy wait, logs the current
+    ! content of buffer 0 and then sets it to the MPI world rank.
+    !   buffers : StarPU buffer descriptors (0 = value to initialize)
+    !   cl_args : codelet arguments, not used here
+    use iso_c_binding
+    use fstarpu_mod
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args
+    real(kind(1.d0)), pointer :: val
+    integer(c_int)            :: wid
+    integer                   :: rank
+
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), val)
+    rank = fstarpu_mpi_world_rank()
+    wid  = fstarpu_worker_get_id()
+    call nf_sleep(0.5d0)
+    ! Initializing with the rank (rather than 0) makes the number of copies
+    ! spawned in the REDUX mode case visible in the final result.
+    write(*,*) "ini_cl (c_w_rank:",rank,"worker_id:",wid,") set to", rank, "(was",val,")"
+    val = rank
+  end subroutine cl_cpu_task_ini
+
+  subroutine nf_sleep(t)
+    ! Busy-wait until more than t seconds have elapsed on the system clock.
+    !   t : duration to wait, in seconds
+    ! Returns immediately when the processor reports no clock, and tolerates
+    ! one wrap-around of the clock counter during the wait.
+    implicit none
+    real(kind(1.d0)), intent(in) :: t
+    integer(kind=8)      :: t_start, t_end, t_rate, t_max, ticks
+    real(kind(1.d0))     :: ta
+    call system_clock(t_start, t_rate, t_max)
+    ! a zero rate means no clock is available: waiting would never terminate
+    if (t_rate.le.0) return
+    do
+       call system_clock(t_end)
+       ticks = t_end - t_start
+       ! the counter restarts from 0 after reaching t_max
+       if (ticks.lt.0) ticks = (ticks + t_max) + 1
+       ta = real(ticks, kind(1.d0))/real(t_rate, kind(1.d0))
+       if(ta.gt.t) return
+    end do
+  end subroutine nf_sleep
+
+end program

+ 238 - 0
mpi/examples/native_fortran/nf_redux_test.f90

@@ -0,0 +1,238 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+!
+program main
+  ! Reduction test on exactly 4 MPI processes: a1/b1 are registered as owned
+  ! by rank 0 and a2/b2 by rank 1. One task accessing a1 in REDUX mode is
+  ! executed on rank 3 and one accessing a2 on rank 2; the contributions are
+  ! then reduced back and each owner prints the computed and expected values.
+  use iso_c_binding
+  use fstarpu_mod
+  use fstarpu_mpi_mod
+
+  implicit none
+
+  integer, target                         :: ret, np, i, j
+  type(c_ptr)                             :: task_cl, task_rw_cl, task_red_cl, task_ini_cl
+  character(kind=c_char,len=*), parameter :: name=C_CHAR_"task"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: namered=C_CHAR_"task_red"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: nameini=C_CHAR_"task_ini"//C_NULL_CHAR
+  real(kind(1.d0)), target                :: a1, a2, b1, b2
+  integer(kind=8)                          :: tag, err
+  type(c_ptr)                             :: a1hdl, a2hdl, b1hdl, b2hdl
+  integer, target                         :: comm, comm_world, comm_w_rank, comm_size
+  integer(c_int), target                  :: w_node
+
+  ! FxT tracing is started explicitly with fstarpu_fxt_start_profiling below
+  call fstarpu_fxt_autostart_profiling(0)
+  ret = fstarpu_init(c_null_ptr)
+  ret = fstarpu_mpi_init(1)
+
+  comm_world = fstarpu_mpi_world_comm()
+  comm_w_rank  = fstarpu_mpi_world_rank()
+  comm_size  = fstarpu_mpi_world_size()
+  ! the task placement below hard-codes ranks 0..3
+  if (comm_size.ne.4) then
+    write(*,'(" ")')
+    write(*,'("This application is meant to run with 4 MPI")')
+    stop 1
+  end if
+  err   = fstarpu_mpi_barrier(comm_world)
+
+  ! each owner sets the initial values of its own data
+  if(comm_w_rank.eq.0) then
+    write(*,'(" ")')
+    a1 = 1.0
+    write(*,*) "init_a1", a1
+    b1 = 0.5
+    write(*,*) "init b1", b1
+  end if
+  if(comm_w_rank.eq.1) then
+    write(*,'(" ")')
+    a2 = 2.0
+    write(*,*) "init_a2", a2
+    b2 = 0.8
+    write(*,*) "init b2", b2
+  end if
+
+  ! allocate and fill codelet structs
+  ! work codelet: buffer 0 accessed in REDUX mode, buffer 1 read-only
+  task_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_cl, name)
+  call fstarpu_codelet_add_cpu_func(task_cl, C_FUNLOC(cl_cpu_task))
+  call fstarpu_codelet_add_buffer(task_cl, FSTARPU_REDUX)
+  call fstarpu_codelet_add_buffer(task_cl, FSTARPU_R)
+
+  ! allocate the reduction and initialization codelets
+  task_red_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_red_cl, namered)
+  call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW)
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
+
+  task_ini_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_ini_cl, nameini)
+  call fstarpu_codelet_add_cpu_func(task_ini_cl,C_FUNLOC(cl_cpu_task_ini))
+  call fstarpu_codelet_add_buffer(task_ini_cl, FSTARPU_W)
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+  ! a1/b1: real storage on rank 0, placeholders (home node -1, null pointer)
+  ! elsewhere; registered for MPI with tags 0 and 1 and rank 0
+  tag = 0
+  if(comm_w_rank.eq.0) then
+        call fstarpu_variable_data_register(a1hdl, 0, c_loc(a1),c_sizeof(a1))
+        call fstarpu_variable_data_register(b1hdl, 0, c_loc(b1),c_sizeof(b1))
+  else
+        call fstarpu_variable_data_register(a1hdl, -1, c_null_ptr,c_sizeof(a1))
+        call fstarpu_variable_data_register(b1hdl, -1, c_null_ptr,c_sizeof(b1))
+  end if
+  call fstarpu_mpi_data_register(a1hdl,tag,0)
+  call fstarpu_mpi_data_register(b1hdl, tag+1,0)
+
+  ! a2/b2: same scheme with rank 1 and tags 2 and 3
+  tag = tag + 2
+  if(comm_w_rank.eq.1) then
+        call fstarpu_variable_data_register(a2hdl, 0, c_loc(a2),c_sizeof(a2))
+        call fstarpu_variable_data_register(b2hdl, 0, c_loc(b2),c_sizeof(b2))
+  else
+        call fstarpu_variable_data_register(a2hdl, -1, c_null_ptr,c_sizeof(a2))
+        call fstarpu_variable_data_register(b2hdl, -1, c_null_ptr,c_sizeof(b2))
+  end if
+  call fstarpu_mpi_data_register(a2hdl,tag,1)
+  call fstarpu_mpi_data_register(b2hdl, tag+1, 1)
+  tag = tag + 2
+
+  ! attach the reduction and initialization codelets to both reduced handles
+  call fstarpu_data_set_reduction_methods(a1hdl, task_red_cl,task_ini_cl)
+  call fstarpu_data_set_reduction_methods(a2hdl, task_red_cl,task_ini_cl)
+
+  err = fstarpu_mpi_barrier(comm_world)
+
+  call fstarpu_fxt_start_profiling()
+
+  ! the task contributing to a1 is requested to execute on rank 3
+  w_node = 3
+  comm = comm_world
+  call fstarpu_mpi_task_insert( (/ c_loc(comm),   &
+             task_cl,                                         &
+             FSTARPU_REDUX, a1hdl,                            &
+             FSTARPU_R, b1hdl,                                &
+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+             C_NULL_PTR /))
+  ! the task contributing to a2 is requested to execute on rank 2
+  w_node = 2
+  comm = comm_world
+  call fstarpu_mpi_task_insert( (/ c_loc(comm),   &
+             task_cl,                                         &
+             FSTARPU_REDUX, a2hdl,                            &
+             FSTARPU_R, b2hdl,                                &
+             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+             C_NULL_PTR /))
+
+  call fstarpu_mpi_redux_data(comm_world, a1hdl)
+  call fstarpu_mpi_redux_data(comm_world, a2hdl)
+  ! write(*,*) "waiting all tasks ..."
+  err = fstarpu_mpi_wait_for_all(comm_world)
+
+  ! expected values: initial a (1.0 resp. 2.0) plus the task's 3.0 + b
+  if(comm_w_rank.eq.0) then
+     write(*,*) 'computed result ---> ',a1, "expected =",4.5
+  end if
+  if(comm_w_rank.eq.1) then
+     write(*,*) 'computed result ---> ',a2, "expected=",5.8
+  end if
+  call fstarpu_data_unregister(a1hdl)
+  call fstarpu_data_unregister(a2hdl)
+  call fstarpu_data_unregister(b1hdl)
+  call fstarpu_data_unregister(b2hdl)
+
+  call fstarpu_fxt_stop_profiling()
+  call fstarpu_codelet_free(task_cl)
+  call fstarpu_codelet_free(task_red_cl)
+  call fstarpu_codelet_free(task_ini_cl)
+
+
+  err = fstarpu_mpi_shutdown()
+  call fstarpu_shutdown()
+
+  stop
+
+contains
+
+  recursive subroutine cl_cpu_task (buffers, cl_args) bind(C)
+    ! Work kernel: after a one-second busy wait, adds 3 plus the value of
+    ! buffer 1 to the value held in buffer 0 and logs the update.
+    ! Buffer 0 is declared in REDUX mode by the codelet, so the contribution
+    ! is accumulated into the copy rather than overwriting it; a plain
+    ! assignment would drop whatever the copy already holds.
+    !   buffers : StarPU buffer descriptors (0 = accumulator, 1 = read-only input)
+    !   cl_args : codelet arguments, not used here
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    integer(c_int) :: worker_id
+    integer        :: comm_rank
+    real(kind(1.d0)), pointer :: a, b
+    real(kind(1.d0))          :: old_a
+
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), b)
+    call nf_sleep(1.d0)
+    old_a = a
+    a = old_a + 3.0 + b
+    write(*,*) "task   (c_w_rank:",comm_rank,") from ",old_a,"to",a
+
+    return
+  end subroutine cl_cpu_task
+
+  recursive subroutine cl_cpu_task_red (buffers, cl_args) bind(C)
+    ! Reduction kernel: adds the value of buffer 1 into buffer 0, then busy
+    ! waits one second and logs source, previous and new destination values.
+    !   buffers : StarPU buffer descriptors (0 = destination, 1 = source)
+    !   cl_args : codelet arguments, not used here
+    use iso_c_binding
+    use fstarpu_mod
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args
+    real(kind(1.d0)), pointer :: dst, src
+    real(kind(1.d0))          :: previous
+    integer                   :: rank
+
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), dst)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), src)
+    rank = fstarpu_mpi_world_rank()
+
+    previous = dst
+    dst = previous + src
+    call nf_sleep(1.d0)
+    write(*,*) "red_cl (c_w_rank:",rank,")",src, previous, ' ---> ',dst
+  end subroutine cl_cpu_task_red
+
+  recursive subroutine cl_cpu_task_ini (buffers, cl_args) bind(C)
+    ! Initialization kernel: after a half-second busy wait, sets the value
+    ! of buffer 0 to zero and logs the MPI world rank it ran on.
+    !   buffers : StarPU buffer descriptors (0 = value to initialize)
+    !   cl_args : codelet arguments, not used here
+    use iso_c_binding
+    use fstarpu_mod
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args
+    real(kind(1.d0)), pointer :: val
+    integer                   :: rank
+
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), val)
+    rank = fstarpu_mpi_world_rank()
+    call nf_sleep(0.5d0)
+    val = 0.0
+    write(*,*) "ini_cl (c_w_rank:",rank,")"
+  end subroutine cl_cpu_task_ini
+
+  subroutine nf_sleep(t)
+    ! Busy-wait until more than t seconds have elapsed on the system clock.
+    !   t : duration to wait, in seconds
+    ! Returns immediately when the processor reports no clock, and tolerates
+    ! one wrap-around of the clock counter during the wait.
+    implicit none
+    real(kind(1.d0)), intent(in) :: t
+    integer(kind=8)      :: t_start, t_end, t_rate, t_max, ticks
+    real(kind(1.d0))     :: ta
+    call system_clock(t_start, t_rate, t_max)
+    ! a zero rate means no clock is available: waiting would never terminate
+    if (t_rate.le.0) return
+    do
+       call system_clock(t_end)
+       ticks = t_end - t_start
+       ! the counter restarts from 0 after reaching t_max
+       if (ticks.lt.0) ticks = (ticks + t_max) + 1
+       ta = real(ticks, kind(1.d0))/real(t_rate, kind(1.d0))
+       if(ta.gt.t) return
+    end do
+  end subroutine nf_sleep
+
+end program main

+ 9 - 0
mpi/include/starpu_mpi.h

@@ -232,6 +232,11 @@ int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, s
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
 
 
 /**
 /**
+   Same as starpu_mpi_irecv_detached() but with the \p prio parameter.
+*/
+int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);
+
+/**
    Post a nonblocking receive in \p data_handle from the node \p
    Post a nonblocking receive in \p data_handle from the node \p
    source using the message tag \p data_tag within the communicator \p
    source using the message tag \p data_tag within the communicator \p
    comm. On completion, the \p callback function is called with the
    comm. On completion, the \p callback function is called with the
@@ -561,6 +566,10 @@ int starpu_mpi_data_get_rank(starpu_data_handle_t handle);
    Return the tag of the given data.
    Return the tag of the given data.
 */
 */
 starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t handle);
 starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t handle);
+/**
+   Return the redux map of the given data.
+*/
+char* starpu_mpi_data_get_redux_map(starpu_data_handle_t handle);
 
 
 /**
 /**
    Symbol kept for backward compatibility. Call function starpu_mpi_data_get_tag()
    Symbol kept for backward compatibility. Call function starpu_mpi_data_get_tag()

+ 0 - 1
mpi/src/mpi/starpu_mpi_early_data.h

@@ -40,7 +40,6 @@ LIST_TYPE(_starpu_mpi_early_data_handle,
 	  void *buffer;
 	  void *buffer;
 	  size_t size;
 	  size_t size;
 	  unsigned buffer_node;
 	  unsigned buffer_node;
-	  int req_ready;
 	  struct _starpu_mpi_node_tag node_tag;
 	  struct _starpu_mpi_node_tag node_tag;
 	  starpu_pthread_mutex_t req_mutex;
 	  starpu_pthread_mutex_t req_mutex;
 	  starpu_pthread_cond_t req_cond;
 	  starpu_pthread_cond_t req_cond;

+ 40 - 34
mpi/src/mpi/starpu_mpi_mpi.c

@@ -50,6 +50,9 @@ static unsigned nready_process;
 /* Number of send requests to submit to MPI at the same time */
 /* Number of send requests to submit to MPI at the same time */
 static unsigned ndetached_send;
 static unsigned ndetached_send;
 
 
+/* Force allocation of early data */
+static int early_data_force_allocate;
+
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 static void _starpu_mpi_add_sync_point_in_fxt(void);
 static void _starpu_mpi_add_sync_point_in_fxt(void);
 #endif
 #endif
@@ -81,6 +84,11 @@ static starpu_pthread_t progress_thread;
 #endif
 #endif
 static int running = 0;
 static int running = 0;
 
 
+/* Provides synchronization between an early request, a sync request, and an early data handle:
+ * we keep it held while checking and posting one to prevent the other.
+ * This is to be taken always before the progress_mutex. */
+static starpu_pthread_mutex_t early_data_mutex;
+
 /* Driver taken by StarPU-MPI to process tasks when there is no requests to
 /* Driver taken by StarPU-MPI to process tasks when there is no requests to
  * handle instead of polling endlessly */
  * handle instead of polling endlessly */
 static struct starpu_driver *mpi_driver = NULL;
 static struct starpu_driver *mpi_driver = NULL;
@@ -103,7 +111,7 @@ static int posted_requests = 0, ready_requests = 0, newer_requests, mpi_wait_for
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 #define _STARPU_MPI_INC_READY_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_ready_requests); ready_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_ready_requests); }
 #define _STARPU_MPI_INC_READY_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_ready_requests); ready_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_ready_requests); }
 
 
-extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
+extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count, int prio);
 
 
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 #pragma weak smpi_simulated_main_
 #pragma weak smpi_simulated_main_
@@ -182,8 +190,6 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
 
 	_STARPU_MPI_DEBUG(0, "new req %p srcdst %d tag %"PRIi64" and type %s %d\n", req, req->node_tag.node.rank, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->backend->is_internal_req);
 	_STARPU_MPI_DEBUG(0, "new req %p srcdst %d tag %"PRIi64" and type %s %d\n", req, req->node_tag.node.rank, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->backend->is_internal_req);
 
 
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
-
 	if (req->request_type == RECV_REQ)
 	if (req->request_type == RECV_REQ)
 	{
 	{
 		/* Case : the request is the internal receive request submitted
 		/* Case : the request is the internal receive request submitted
@@ -206,6 +212,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 				req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 			}
 			}
 
 
+			STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 					  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr,
 					  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr,
 					  req->datatype_name, (int)req->count, req->registered_datatype);
 					  req->datatype_name, (int)req->count, req->registered_datatype);
@@ -213,31 +220,24 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			_STARPU_MPI_INC_READY_REQUESTS(+1);
 			_STARPU_MPI_INC_READY_REQUESTS(+1);
 
 
 			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
 			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
-			STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-			STARPU_PTHREAD_MUTEX_LOCK(&req->backend->posted_mutex);
 			req->posted = 1;
 			req->posted = 1;
 			STARPU_PTHREAD_COND_BROADCAST(&req->backend->posted_cond);
 			STARPU_PTHREAD_COND_BROADCAST(&req->backend->posted_cond);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->posted_mutex);
-			STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		}
 		}
 		else
 		else
 		{
 		{
+			STARPU_PTHREAD_MUTEX_LOCK(&early_data_mutex);
 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
 
 
 			if (early_data_handle)
 			if (early_data_handle)
 			{
 			{
+				/* Got the early_data_handle */
+				STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
+
 				/* Case: a receive request for a data with the given tag and source has already been
 				/* Case: a receive request for a data with the given tag and source has already been
 				 * posted to MPI by StarPU. Asynchronously requests a Read permission over the temporary handle ,
 				 * posted to MPI by StarPU. Asynchronously requests a Read permission over the temporary handle ,
 				 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
 				 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
 				 * will be called to bring the data back to the original data handle associated to the request.*/
 				 * will be called to bring the data back to the original data handle associated to the request.*/
-				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
-				while (!(early_data_handle->req_ready))
-					STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req_cond), &(early_data_handle->req_mutex));
-				STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req_mutex));
-				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
-
 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %"PRIi64" has already been received, copying previously received data into handle's pointer..\n", req, req->node_tag.data_tag);
 				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %"PRIi64" has already been received, copying previously received data into handle's pointer..\n", req, req->node_tag.data_tag);
 				STARPU_ASSERT(req->data_handle != early_data_handle->handle);
 				STARPU_ASSERT(req->data_handle != early_data_handle->handle);
 
 
@@ -254,9 +254,8 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				cb_args->req = req;
 				cb_args->req = req;
 
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
-				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 				// FIXME: when buffer == NULL, do not hardcode acquiring on early_data_handle->buffer_node, to just acquire where the data happens to have been stored by MPI
 				// FIXME: when buffer == NULL, do not hardcode acquiring on early_data_handle->buffer_node, to just acquire where the data happens to have been stored by MPI
-				starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(early_data_handle->handle,early_data_handle->buffer_node,STARPU_R,NULL,_starpu_mpi_early_data_cb,(void*) cb_args,  1, 0, NULL, NULL);
+				starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(early_data_handle->handle,early_data_handle->buffer_node,STARPU_R,NULL,_starpu_mpi_early_data_cb,(void*) cb_args,  1, 0, NULL, NULL, req->prio);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			}
 			}
 			else
 			else
@@ -265,6 +264,8 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
 				if (sync_req)
 				if (sync_req)
 				{
 				{
+					/* Got the sync req */
+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 					/* Case: we already received the send envelope, we can proceed with the receive */
 					/* Case: we already received the send envelope, we can proceed with the receive */
 					req->sync = 1;
 					req->sync = 1;
 					_starpu_mpi_datatype_allocate(req->data_handle, req);
 					_starpu_mpi_datatype_allocate(req->data_handle, req);
@@ -279,6 +280,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 						STARPU_ASSERT(req->count);
 						STARPU_ASSERT(req->count);
 						req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 						req->ptr = (void *)starpu_malloc_on_node_flags(req->node, req->count, 0);
 					}
 					}
+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_STARPU_MPI_INC_READY_REQUESTS(+1);
 					_STARPU_MPI_INC_READY_REQUESTS(+1);
 					/* Throw away the dumb request that was only used to know that we got the envelope */
 					/* Throw away the dumb request that was only used to know that we got the envelope */
@@ -288,13 +290,17 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				{
 				{
 					/* Case: no matching data has been received. Store the receive request as an early_request. */
 					/* Case: no matching data has been received. Store the receive request as an early_request. */
 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 					_starpu_mpi_early_request_enqueue(req);
 					_starpu_mpi_early_request_enqueue(req);
+					/* We have queued our early request, we can let the progression thread look at it */
+					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 				}
 				}
 			}
 			}
 		}
 		}
 	}
 	}
 	else
 	else
 	{
 	{
+		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		if (req->request_type == SEND_REQ)
 		if (req->request_type == SEND_REQ)
 			_starpu_mpi_req_prio_list_push_front(&ready_send_requests, req);
 			_starpu_mpi_req_prio_list_push_front(&ready_send_requests, req);
 		else
 		else
@@ -1157,13 +1163,11 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 	_starpu_mpi_early_data_add(early_data_handle);
 	_starpu_mpi_early_data_add(early_data_handle);
 
 
 	starpu_data_handle_t data_handle;
 	starpu_data_handle_t data_handle;
-	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	data_handle = _starpu_mpi_tag_get_data_handle_from_tag(envelope->data_tag);
 	data_handle = _starpu_mpi_tag_get_data_handle_from_tag(envelope->data_tag);
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 
 
 	// TODO: rather select some memory node next to the NIC
 	// TODO: rather select some memory node next to the NIC
 	unsigned buffer_node = STARPU_MAIN_RAM;
 	unsigned buffer_node = STARPU_MAIN_RAM;
-	if (data_handle && starpu_data_get_interface_id(data_handle) < STARPU_MAX_INTERFACE_ID)
+	if (data_handle && starpu_data_get_interface_id(data_handle) < STARPU_MAX_INTERFACE_ID && !early_data_force_allocate)
 	{
 	{
 		/* We know which data will receive it and we won't have to unpack, use just the same kind of data.  */
 		/* We know which data will receive it and we won't have to unpack, use just the same kind of data.  */
 		early_data_handle->buffer = NULL;
 		early_data_handle->buffer = NULL;
@@ -1190,25 +1194,16 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	early_data_handle->req = _starpu_mpi_irecv_common(early_data_handle->handle, status.MPI_SOURCE,
 	early_data_handle->req = _starpu_mpi_irecv_common(early_data_handle->handle, status.MPI_SOURCE,
 							  early_data_handle->node_tag.data_tag, comm, 1, 0,
 							  early_data_handle->node_tag.data_tag, comm, 1, 0,
-							  NULL, NULL, 1, 1, envelope->size);
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+							  NULL, NULL, 1, 1, envelope->size, STARPU_DEFAULT_PRIO);
+	/* The early data handle is ready, we can let _starpu_mpi_submit_ready_request
+	 * proceed with acquiring it */
+	STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 
 
+	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 	// We wait until the request is pushed in the
 	// We wait until the request is pushed in the
 	// ready_request list
 	// ready_request list
-	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-	STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->backend->posted_mutex));
 	while (!(early_data_handle->req->posted))
 	while (!(early_data_handle->req->posted))
-		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->backend->posted_cond), &(early_data_handle->req->backend->posted_mutex));
-	STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->backend->posted_mutex));
-
-#ifdef STARPU_DEVEL
-#warning check if req_ready is still necessary
-#endif
-	STARPU_PTHREAD_MUTEX_LOCK(&early_data_handle->req_mutex);
-	early_data_handle->req_ready = 1;
-	STARPU_PTHREAD_COND_BROADCAST(&early_data_handle->req_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
-	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->backend->posted_cond), &progress_mutex);
 
 
 	// Handle the request immediately to make sure the mpi_irecv is
 	// Handle the request immediately to make sure the mpi_irecv is
 	// posted before receiving another envelope
 	// posted before receiving another envelope
@@ -1421,6 +1416,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 				{
 				{
 					_STARPU_MPI_DEBUG(3, "Searching for application request with tag %"PRIi64" and source %d (size %ld)\n", envelope->data_tag, envelope_status.MPI_SOURCE, envelope->size);
 					_STARPU_MPI_DEBUG(3, "Searching for application request with tag %"PRIi64" and source %d (size %ld)\n", envelope->data_tag, envelope_status.MPI_SOURCE, envelope->size);
 
 
+					STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&early_data_mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 					struct _starpu_mpi_req *early_request = _starpu_mpi_early_request_dequeue(envelope->data_tag, envelope_status.MPI_SOURCE, envelope_comm);
 					struct _starpu_mpi_req *early_request = _starpu_mpi_early_request_dequeue(envelope->data_tag, envelope_status.MPI_SOURCE, envelope_comm);
 
 
 					/* Case: a data will arrive before a matching receive is
 					/* Case: a data will arrive before a matching receive is
@@ -1453,9 +1451,12 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 							new_req->backend->is_internal_req = 0; // ????
 							new_req->backend->is_internal_req = 0; // ????
 							new_req->count = envelope->size;
 							new_req->count = envelope->size;
 							_starpu_mpi_sync_data_add(new_req);
 							_starpu_mpi_sync_data_add(new_req);
+							/* We have queued our sync request, we can let _starpu_mpi_submit_ready_request find it */
+							STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 						}
 						}
 						else
 						else
 						{
 						{
+							/* This will release early_data_mutex when appropriate */
 							_starpu_mpi_receive_early_data(envelope, envelope_status, envelope_comm);
 							_starpu_mpi_receive_early_data(envelope, envelope_status, envelope_comm);
 						}
 						}
 					}
 					}
@@ -1466,6 +1467,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					 * _starpu_mpi_handle_ready_request. */
 					 * _starpu_mpi_handle_ready_request. */
 					else
 					else
 					{
 					{
+						/* Got the early request */
+						STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_mutex);
 						_STARPU_MPI_DEBUG(2000, "A matching application request has been found for the incoming data with tag %"PRIi64"\n", envelope->data_tag);
 						_STARPU_MPI_DEBUG(2000, "A matching application request has been found for the incoming data with tag %"PRIi64"\n", envelope->data_tag);
 						_STARPU_MPI_DEBUG(2000, "Request sync %d\n", envelope->sync);
 						_STARPU_MPI_DEBUG(2000, "Request sync %d\n", envelope->sync);
 
 
@@ -1621,6 +1624,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 {
 {
         STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
         STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
+        STARPU_PTHREAD_MUTEX_INIT(&early_data_mutex, NULL);
         STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
         STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
         STARPU_PTHREAD_COND_INIT(&barrier_cond, NULL);
         STARPU_PTHREAD_COND_INIT(&barrier_cond, NULL);
 	_starpu_mpi_req_list_init(&ready_recv_requests);
 	_starpu_mpi_req_list_init(&ready_recv_requests);
@@ -1634,6 +1638,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
 
 	nready_process = starpu_get_env_number_default("STARPU_MPI_NREADY_PROCESS", 10);
 	nready_process = starpu_get_env_number_default("STARPU_MPI_NREADY_PROCESS", 10);
 	ndetached_send = starpu_get_env_number_default("STARPU_MPI_NDETACHED_SEND", 10);
 	ndetached_send = starpu_get_env_number_default("STARPU_MPI_NDETACHED_SEND", 10);
+	early_data_force_allocate = starpu_get_env_number_default("STARPU_MPI_EARLYDATA_ALLOCATE", 0);
 
 
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	STARPU_PTHREAD_MUTEX_INIT(&wait_counter_mutex, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&wait_counter_mutex, NULL);
@@ -1688,6 +1693,7 @@ void _starpu_mpi_progress_shutdown(void **value)
         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_posted_requests);
         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_posted_requests);
         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_ready_requests);
         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_ready_requests);
         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
+        STARPU_PTHREAD_MUTEX_DESTROY(&early_data_mutex);
         STARPU_PTHREAD_COND_DESTROY(&barrier_cond);
         STARPU_PTHREAD_COND_DESTROY(&barrier_cond);
 }
 }
 
 

+ 0 - 2
mpi/src/mpi/starpu_mpi_mpi_backend.c

@@ -54,7 +54,6 @@ void _starpu_mpi_mpi_backend_request_init(struct _starpu_mpi_req *req)
 
 
 	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->req_mutex, NULL);
 	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->req_mutex, NULL);
 	STARPU_PTHREAD_COND_INIT0(&req->backend->req_cond, NULL);
 	STARPU_PTHREAD_COND_INIT0(&req->backend->req_cond, NULL);
-	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->posted_mutex, NULL);
 	STARPU_PTHREAD_COND_INIT0(&req->backend->posted_cond, NULL);
 	STARPU_PTHREAD_COND_INIT0(&req->backend->posted_cond, NULL);
 
 
 	//req->backend->other_request = NULL;
 	//req->backend->other_request = NULL;
@@ -80,7 +79,6 @@ void _starpu_mpi_mpi_backend_request_destroy(struct _starpu_mpi_req *req)
 {
 {
 	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->req_mutex);
 	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->req_mutex);
 	STARPU_PTHREAD_COND_DESTROY(&req->backend->req_cond);
 	STARPU_PTHREAD_COND_DESTROY(&req->backend->req_cond);
-	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->posted_mutex);
 	STARPU_PTHREAD_COND_DESTROY(&req->backend->posted_cond);
 	STARPU_PTHREAD_COND_DESTROY(&req->backend->posted_cond);
 	free(req->backend);
 	free(req->backend);
 	req->backend = NULL;
 	req->backend = NULL;

+ 0 - 1
mpi/src/mpi/starpu_mpi_mpi_backend.h

@@ -54,7 +54,6 @@ struct _starpu_mpi_req_backend
 
 
 	starpu_pthread_mutex_t req_mutex;
 	starpu_pthread_mutex_t req_mutex;
 	starpu_pthread_cond_t req_cond;
 	starpu_pthread_cond_t req_cond;
-	starpu_pthread_mutex_t posted_mutex;
 	starpu_pthread_cond_t posted_cond;
 	starpu_pthread_cond_t posted_cond;
 	/** In the case of a Wait/Test request, we are going to post a request
 	/** In the case of a Wait/Test request, we are going to post a request
 	 * to test the completion of another request */
 	 * to test the completion of another request */

+ 28 - 9
mpi/src/starpu_mpi.c

@@ -161,12 +161,12 @@ static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum sta
 
 
 	if (sequential_consistency)
 	if (sequential_consistency)
 	{
 	{
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid, req->prio);
 	}
 	}
 	else
 	else
 	{
 	{
 		/* post_sync_job_id has already been filled */
 		/* post_sync_job_id has already been filled */
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL, req->prio);
 	}
 	}
 }
 }
 
 
@@ -289,7 +289,7 @@ int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, starp
 	return starpu_mpi_issend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
 	return starpu_mpi_issend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
 }
 }
 
 
-struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count)
+struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count, int prio)
 {
 {
 	if (_starpu_mpi_fake_world_size != -1)
 	if (_starpu_mpi_fake_world_size != -1)
 	{
 	{
@@ -297,7 +297,7 @@ struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handl
 		return NULL;
 		return NULL;
 	}
 	}
 
 
-	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, 0, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
+	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, prio, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
 	_starpu_mpi_req_willpost(req);
 	_starpu_mpi_req_willpost(req);
 
 
 	if (sequential_consistency == 0)
 	if (sequential_consistency == 0)
@@ -317,7 +317,7 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 
 
 	struct _starpu_mpi_req *req;
 	struct _starpu_mpi_req *req;
 	_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(source, data_tag);
 	_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(source, data_tag);
-	req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0);
+	req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0, STARPU_DEFAULT_PRIO);
 	_STARPU_MPI_TRACE_IRECV_COMPLETE_END(source, data_tag);
 	_STARPU_MPI_TRACE_IRECV_COMPLETE_END(source, data_tag);
 
 
 	STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
 	STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
@@ -331,7 +331,17 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, star
 {
 {
 	_STARPU_MPI_LOG_IN();
 	_STARPU_MPI_LOG_IN();
 
 
-	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0);
+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, STARPU_DEFAULT_PRIO);
+	_STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
+{
+	_STARPU_MPI_LOG_IN();
+
+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, prio);
+
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
@@ -340,7 +350,7 @@ int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_h
 {
 {
 	_STARPU_MPI_LOG_IN();
 	_STARPU_MPI_LOG_IN();
 
 
-	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0);
+	_starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0, STARPU_DEFAULT_PRIO);
 
 
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
 	return 0;
 	return 0;
@@ -379,10 +389,13 @@ int starpu_mpi_barrier(MPI_Comm comm)
 
 
 void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
 void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
 {
 {
+	struct _starpu_mpi_data *data = data_handle->mpi_data;
 	_mpi_backend._starpu_mpi_backend_data_clear(data_handle);
 	_mpi_backend._starpu_mpi_backend_data_clear(data_handle);
 	_starpu_mpi_cache_data_clear(data_handle);
 	_starpu_mpi_cache_data_clear(data_handle);
-	_starpu_spin_destroy(&((struct _starpu_mpi_data*) data_handle->mpi_data)->coop_lock);
-	free(data_handle->mpi_data);
+	_starpu_spin_destroy(&data->coop_lock);
+	if (data->redux_map != REDUX_CONTRIB)
+		free(data->redux_map);
+	free(data);
 	data_handle->mpi_data = NULL;
 	data_handle->mpi_data = NULL;
 }
 }
 
 
@@ -448,6 +461,12 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
 	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
 	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
 }
 }
 
 
+char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data)
+{
+	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
+	return ((struct _starpu_mpi_data *)(data->mpi_data))->redux_map;
+}
+
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
 {
 {
 	int me, rank;
 	int me, rank;

+ 1 - 2
mpi/src/starpu_mpi_coop_sends.c

@@ -297,8 +297,7 @@ void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_
 
 
 	if (first)
 	if (first)
 		/* We were first, we are responsible for acquiring the data for everybody */
 		/* We were first, we are responsible for acquiring the data for everybody */
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, -1, mode, _starpu_mpi_coop_send_acquired_callback, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &coop_sends->pre_sync_jobid, NULL);
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, -1, mode, _starpu_mpi_coop_send_acquired_callback, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &coop_sends->pre_sync_jobid, NULL, req->prio);
 	else
 	else
 		req->pre_sync_jobid = coop_sends->pre_sync_jobid;
 		req->pre_sync_jobid = coop_sends->pre_sync_jobid;
 }
 }
-

+ 16 - 6
mpi/src/starpu_mpi_private.h

@@ -118,7 +118,7 @@ int _starpu_debug_rank;
 			fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%ld:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
 			fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%ld:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
 			fflush(stderr);	\
 			fflush(stderr);	\
 		} \
 		} \
-	} while(0);
+	} while(0)
 #  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
 #  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
 #  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm)  _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
 #  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm)  _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
 #  define _STARPU_MPI_DEBUG(level, fmt, ...) \
 #  define _STARPU_MPI_DEBUG(level, fmt, ...) \
@@ -130,7 +130,7 @@ int _starpu_debug_rank;
 			fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__,## __VA_ARGS__); \
 			fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__,## __VA_ARGS__); \
 			fflush(stderr); \
 			fflush(stderr); \
 		} \
 		} \
-	} while(0);
+	} while(0)
 #else
 #else
 #  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way)  do { } while(0)
 #  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way)  do { } while(0)
 #  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm)     do { } while(0)
 #  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm)     do { } while(0)
@@ -141,10 +141,10 @@ int _starpu_debug_rank;
 #define _STARPU_MPI_DISP(fmt, ...) do { if (!_starpu_silent) { \
 #define _STARPU_MPI_DISP(fmt, ...) do { if (!_starpu_silent) { \
 	       				     if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
 	       				     if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
                                              fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
                                              fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
-                                             fflush(stderr); }} while(0);
+                                             fflush(stderr); }} while(0)
 #define _STARPU_MPI_MSG(fmt, ...) do { if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
 #define _STARPU_MPI_MSG(fmt, ...) do { if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
                                              fprintf(stderr, "[%d][starpu_mpi][%s:%d] " fmt , _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
                                              fprintf(stderr, "[%d][starpu_mpi][%s:%d] " fmt , _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
-                                             fflush(stderr); } while(0);
+                                             fflush(stderr); } while(0)
 
 
 #ifdef STARPU_MPI_EXTRA_VERBOSE
 #ifdef STARPU_MPI_EXTRA_VERBOSE
 #  define _STARPU_MPI_LOG_IN()             do { if (!_starpu_silent) { \
 #  define _STARPU_MPI_LOG_IN()             do { if (!_starpu_silent) { \
@@ -203,6 +203,12 @@ struct _starpu_mpi_coop_sends
 	long pre_sync_jobid;
 	long pre_sync_jobid;
 };
 };
 
 
+/** cf. redux_map field: sentinel value put in this field
+ * on a node that does not own the data but contributes
+ * to its reduction.
+ * Only the owning node keeps track of all the contributing nodes. */
+#define REDUX_CONTRIB ((char*) -1)
+
 /** Initialized in starpu_mpi_data_register_comm */
 /** Initialized in starpu_mpi_data_register_comm */
 struct _starpu_mpi_data
 struct _starpu_mpi_data
 {
 {
@@ -211,8 +217,12 @@ struct _starpu_mpi_data
 	char *cache_sent;
 	char *cache_sent;
 	int cache_received;
 	int cache_received;
 
 
-	/** Rendez-vous data for opportunistic cooperative sends */
-	/** Needed to synchronize between submit thread and workers */
+	/** Array used to store the contributing nodes to this data
+	  * when it is accessed in REDUX mode. */
+	char* redux_map;
+
+	/** Rendez-vous data for opportunistic cooperative sends,
+	  * Needed to synchronize between submit thread and workers */
 	struct _starpu_spinlock coop_lock;
 	struct _starpu_spinlock coop_lock;
 	/** Current cooperative send bag */
 	/** Current cooperative send bag */
 	struct _starpu_mpi_coop_sends *coop_sends;
 	struct _starpu_mpi_coop_sends *coop_sends;

+ 55 - 20
mpi/src/starpu_mpi_task_insert.c

@@ -100,7 +100,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 	{
 	{
 		STARPU_ASSERT_MSG(starpu_mpi_data_get_rank(data) == STARPU_MPI_PER_NODE, "If task is replicated, it has to access only per-node data");
 		STARPU_ASSERT_MSG(starpu_mpi_data_get_rank(data) == STARPU_MPI_PER_NODE, "If task is replicated, it has to access only per-node data");
 	}
 	}
-	if (data && mode & STARPU_R)
+	if (data && mode & STARPU_R && !(mode & STARPU_MPI_REDUX))
 	{
 	{
 		int mpi_rank = starpu_mpi_data_get_rank(data);
 		int mpi_rank = starpu_mpi_data_get_rank(data);
 		starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(data);
 		starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(data);
@@ -118,7 +118,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 				if (data_tag == -1)
 				if (data_tag == -1)
 					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 				_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data, mpi_rank);
 				_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data, mpi_rank);
-				starpu_mpi_irecv_detached(data, mpi_rank, data_tag, comm, NULL, NULL);
+				starpu_mpi_irecv_detached_prio(data, mpi_rank, data_tag, prio, comm, NULL, NULL);
 			}
 			}
 			// else the node has already received the data
 			// else the node has already received the data
 		}
 		}
@@ -142,7 +142,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 static
 static
 void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm)
 void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm)
 {
 {
-	if (mode & STARPU_W)
+	if (mode & STARPU_W && !(mode & STARPU_MPI_REDUX))
 	{
 	{
 		int mpi_rank = starpu_mpi_data_get_rank(data);
 		int mpi_rank = starpu_mpi_data_get_rank(data);
 		starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(data);
 		starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(data);
@@ -179,7 +179,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 {
 {
 	if (_starpu_cache_enabled)
 	if (_starpu_cache_enabled)
 	{
 	{
-		if (mode & STARPU_W || mode & STARPU_REDUX)
+		if ((mode & STARPU_W && !(mode & STARPU_MPI_REDUX)) || mode & STARPU_REDUX)
 		{
 		{
 			/* The data has been modified, it MUST be removed from the cache */
 			/* The data has been modified, it MUST be removed from the cache */
 			starpu_mpi_cached_send_clear(data);
 			starpu_mpi_cached_send_clear(data);
@@ -189,7 +189,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 	else
 	else
 	{
 	{
 		/* We allocated a temporary buffer for the received data, now drop it */
 		/* We allocated a temporary buffer for the received data, now drop it */
-		if ((mode & STARPU_R) && do_execute)
+		if ((mode & STARPU_R && !(mode & STARPU_MPI_REDUX)) && do_execute)
 		{
 		{
 			int mpi_rank = starpu_mpi_data_get_rank(data);
 			int mpi_rank = starpu_mpi_data_get_rank(data);
 			if (mpi_rank == STARPU_MPI_PER_NODE)
 			if (mpi_rank == STARPU_MPI_PER_NODE)
@@ -254,7 +254,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 				inconsistent_execute = 0;
 				inconsistent_execute = 0;
 			}
 			}
 		}
 		}
-		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX || arg_type & STARPU_MPI_REDUX)
 		{
 		{
 			starpu_data_handle_t data = va_arg(varg_list_copy, starpu_data_handle_t);
 			starpu_data_handle_t data = va_arg(varg_list_copy, starpu_data_handle_t);
 			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
 			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
@@ -617,6 +617,20 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 
 
 	for(i=0 ; i<nb_data ; i++)
 	for(i=0 ; i<nb_data ; i++)
 	{
 	{
+		if ((descrs[i].mode & STARPU_REDUX || descrs[i].mode & STARPU_MPI_REDUX) && descrs[i].handle)
+		{
+			struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data;
+			if (me == starpu_mpi_data_get_rank(descrs[i].handle))
+			{
+				int size;
+				starpu_mpi_comm_size(comm, &size);
+				if (mpi_data->redux_map == NULL)
+					_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
+				mpi_data->redux_map [xrank] = 1;
+			}
+			else if (me == xrank)
+				mpi_data->redux_map = REDUX_CONTRIB;
+		}
 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
 	}
 	}
@@ -813,6 +827,11 @@ void _starpu_mpi_redux_fill_post_sync_jobid(const void * const redux_data_args,
 
 
 /* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
 /* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
  * a data previously accessed in REDUX mode gets accessed in R mode. */
  * a data previously accessed in REDUX mode gets accessed in R mode. */
+/* FIXME: In order to prevent simultaneous receive submissions
+ * on the same handle, we need to wait until all the starpu_mpi
+ * tasks are done before submitting the next tasks. The current
+ * version of the implementation does not support multiple
+ * simultaneous receive requests on the same handle.*/
 void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
 void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
 {
 {
 	int me, rank, nb_nodes;
 	int me, rank, nb_nodes;
@@ -820,6 +839,7 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 
 
 	rank = starpu_mpi_data_get_rank(data_handle);
 	rank = starpu_mpi_data_get_rank(data_handle);
 	data_tag = starpu_mpi_data_get_tag(data_handle);
 	data_tag = starpu_mpi_data_get_tag(data_handle);
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 	if (rank == -1)
 	if (rank == -1)
 	{
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
@@ -832,12 +852,16 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 	starpu_mpi_comm_rank(comm, &me);
 	starpu_mpi_comm_rank(comm, &me);
 	starpu_mpi_comm_size(comm, &nb_nodes);
 	starpu_mpi_comm_size(comm, &nb_nodes);
 
 
-	_STARPU_MPI_DEBUG(1, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
-
+	_STARPU_MPI_DEBUG(50, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
 	// need to count how many nodes have the data in redux mode
 	// need to count how many nodes have the data in redux mode
 	if (me == rank)
 	if (me == rank)
 	{
 	{
-		int i;
+		int i,j;
+		_STARPU_MPI_DEBUG(50, "Who is in the map ?\n");
+		for (j = 0; j<nb_nodes; j++)
+		{
+			_STARPU_MPI_DEBUG(50, "%d is in the map ? %d\n", j, mpi_data->redux_map[j]);
+		}
 
 
 		// taskC depends on all taskBs created
 		// taskC depends on all taskBs created
 		// Creating synchronization task and use its jobid for tracing
 		// Creating synchronization task and use its jobid for tracing
@@ -848,8 +872,9 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 
 
 		for(i=0 ; i<nb_nodes ; i++)
 		for(i=0 ; i<nb_nodes ; i++)
 		{
 		{
-			if (i != rank)
+			if (i != rank && mpi_data->redux_map[i])
 			{
 			{
+				_STARPU_MPI_DEBUG(5, "%d takes part in the reduction of %p \n", i, data_handle);
 				/* We need to make sure all is
 				/* We need to make sure all is
 				 * executed after data_handle finished
 				 * executed after data_handle finished
 				 * its last read access, we hence do
 				 * its last read access, we hence do
@@ -893,24 +918,34 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 						   STARPU_CALLBACK_WITH_ARG_NFREE, _starpu_mpi_redux_data_recv_callback, args,
 						   STARPU_CALLBACK_WITH_ARG_NFREE, _starpu_mpi_redux_data_recv_callback, args,
 						   0);
 						   0);
 			}
 			}
+			else
+			{
+				_STARPU_MPI_DEBUG(5, "%d is not in the map or is me\n", i);
+			}
 		}
 		}
 
 
 		int ret = starpu_task_submit(taskC);
 		int ret = starpu_task_submit(taskC);
 		STARPU_ASSERT(ret == 0);
 		STARPU_ASSERT(ret == 0);
 	}
 	}
-	else
+	else if (mpi_data->redux_map)
 	{
 	{
-		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
+		STARPU_ASSERT(mpi_data->redux_map == REDUX_CONTRIB);
+		_STARPU_MPI_DEBUG(5, "Sending redux handle to %d ...\n", rank);
 		starpu_mpi_isend_detached_prio(data_handle, rank, data_tag, prio, comm, NULL, NULL);
 		starpu_mpi_isend_detached_prio(data_handle, rank, data_tag, prio, comm, NULL, NULL);
-		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
+		starpu_data_invalidate_submit(data_handle);
 	}
 	}
-	/* FIXME: In order to prevent simultaneous receive submissions
-	 * on the same handle, we need to wait that all the starpu_mpi
-	 * tasks are done before submitting next tasks. The current
-	 * version of the implementation does not support multiple
-	 * simultaneous receive requests on the same handle.*/
-	starpu_task_wait_for_all();
-
+	else
+	{
+		_STARPU_MPI_DEBUG(5, "I am not in the map of %d, I am %d ...\n", rank, me);
+	}
+	if (mpi_data->redux_map != NULL)
+	{
+		_STARPU_MPI_DEBUG(100, "waiting for redux tasks with %d\n", rank);
+		starpu_task_wait_for_all();
+	}
+	if (me == rank)
+		free(mpi_data->redux_map);
+	mpi_data->redux_map = NULL;
 }
 }
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 {
 {

+ 1 - 1
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -74,7 +74,7 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 				inconsistent_execute = 0;
 				inconsistent_execute = 0;
 			}
 			}
 		}
 		}
-		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX || arg_type & STARPU_MPI_REDUX)
 		{
 		{
 			arg_i++;
 			arg_i++;
 			starpu_data_handle_t data = arglist[arg_i];
 			starpu_data_handle_t data = arglist[arg_i];

+ 1 - 1
mpi/tests/mpi_reduction.c

@@ -37,7 +37,7 @@ static struct starpu_codelet init_codelet =
 static struct starpu_codelet redux_codelet =
 static struct starpu_codelet redux_codelet =
 {
 {
 	.cpu_funcs = {redux_cpu_func},
 	.cpu_funcs = {redux_cpu_func},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.nbuffers = 2,
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	.model = &starpu_perfmodel_nop,
 	.model = &starpu_perfmodel_nop,

+ 3 - 0
mpi/tests/mpi_redux.c

@@ -14,6 +14,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
+/* This test does a manual reduction: all ranks send a number to the rank 0,
+ * the rank 0 sums these numbers and sends back the result to all ranks. */
+
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
 
 

+ 20 - 20
src/common/fxt.h

@@ -342,7 +342,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_FULL_PROBE1STR
 #ifdef FUT_FULL_PROBE1STR
@@ -356,7 +356,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE1STR(CODE, P1, str);		\
 	_STARPU_FUT_ALWAYS_PROBE1STR(CODE, P1, str);		\
     }									\
     }									\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_ALWAYS_PROBE2STR
 #ifdef FUT_ALWAYS_PROBE2STR
@@ -377,7 +377,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_FULL_PROBE2STR
 #ifdef FUT_FULL_PROBE2STR
@@ -388,7 +388,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE2STR(CODE, P1, P2, str);		\
 	_STARPU_FUT_ALWAYS_PROBE2STR(CODE, P1, P2, str);		\
     }									\
     }									\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_ALWAYS_PROBE3STR
 #ifdef FUT_ALWAYS_PROBE3STR
@@ -410,7 +410,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_FULL_PROBE3STR
 #ifdef FUT_FULL_PROBE3STR
@@ -421,7 +421,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE3STR(CODE, P1, P2, P3, str);	\
 	_STARPU_FUT_ALWAYS_PROBE3STR(CODE, P1, P2, P3, str);	\
     }									\
     }									\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_ALWAYS_PROBE4STR
 #ifdef FUT_ALWAYS_PROBE4STR
@@ -444,7 +444,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_FULL_PROBE4STR
 #ifdef FUT_FULL_PROBE4STR
@@ -455,7 +455,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE4STR(CODE, P1, P2, P3, P4, str);	\
 	_STARPU_FUT_ALWAYS_PROBE4STR(CODE, P1, P2, P3, P4, str);	\
     }									\
     }									\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_ALWAYS_PROBE5STR
 #ifdef FUT_ALWAYS_PROBE5STR
@@ -479,7 +479,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_FULL_PROBE5STR
 #ifdef FUT_FULL_PROBE5STR
@@ -490,7 +490,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE5STR(CODE, P1, P2, P3, P4, P5, str);	\
 	_STARPU_FUT_ALWAYS_PROBE5STR(CODE, P1, P2, P3, P4, P5, str);	\
     }									\
     }									\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_ALWAYS_PROBE6STR
 #ifdef FUT_ALWAYS_PROBE6STR
@@ -515,7 +515,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_FULL_PROBE6STR
 #ifdef FUT_FULL_PROBE6STR
@@ -526,7 +526,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str);	\
 	_STARPU_FUT_ALWAYS_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str);	\
     }									\
     }									\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_ALWAYS_PROBE7STR
 #ifdef FUT_ALWAYS_PROBE7STR
@@ -552,7 +552,7 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
 	_STARPU_FUT_COMMIT(total_len);					\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifdef FUT_FULL_PROBE7STR
 #ifdef FUT_FULL_PROBE7STR
@@ -563,7 +563,7 @@ do {									\
     if(KEYMASK & fut_active) {						\
     if(KEYMASK & fut_active) {						\
 	_STARPU_FUT_ALWAYS_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str);	\
 	_STARPU_FUT_ALWAYS_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str);	\
     }									\
     }									\
-} while (0);
+} while (0)
 #endif
 #endif
 
 
 #ifndef FUT_RAW_PROBE7
 #ifndef FUT_RAW_PROBE7
@@ -787,7 +787,7 @@ do {									\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
 		FUT_FULL_PROBE7(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
 		FUT_FULL_PROBE7(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
 	}								\
 	}								\
-} while(0);
+} while(0)
 
 
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)			\
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)			\
 do {									\
 do {									\
@@ -796,7 +796,7 @@ do {									\
 	char _archname[32]=""; \
 	char _archname[32]=""; \
 	starpu_perfmodel_get_arch_name(perf_arch, _archname, 32, 0);	\
 	starpu_perfmodel_get_arch_name(perf_arch, _archname, 32, 0);	\
 	_STARPU_FUT_FULL_PROBE5STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_END_CODELET_BODY, (job)->job_id, (job_size), (job_hash), workerid, _starpu_gettid(), _archname); \
 	_STARPU_FUT_FULL_PROBE5STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_END_CODELET_BODY, (job)->job_id, (job_size), (job_hash), workerid, _starpu_gettid(), _archname); \
-} while(0);
+} while(0)
 
 
 #define _STARPU_TRACE_START_EXECUTING()				\
 #define _STARPU_TRACE_START_EXECUTING()				\
 	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_START_EXECUTING, _starpu_gettid());
 	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_START_EXECUTING, _starpu_gettid());
@@ -898,7 +898,7 @@ do {										\
 	else {									\
 	else {									\
 		FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 0);\
 		FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 0);\
 	}									\
 	}									\
-} while(0);
+} while(0)
 
 
 #define _STARPU_TRACE_DATA_NAME(handle, name) \
 #define _STARPU_TRACE_DATA_NAME(handle, name) \
 	_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_META, _STARPU_FUT_DATA_NAME, handle, name)
 	_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_META, _STARPU_FUT_DATA_NAME, handle, name)
@@ -1319,8 +1319,8 @@ do {										\
 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)          \
 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)          \
        FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_STATE_SHARED, handle, node)
        FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_STATE_SHARED, handle, node)
 
 
-#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre)          \
-       FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_REQUEST_CREATED, orig, dest, prio, handle, is_pre)
+#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre, req)          \
+       FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_REQUEST_CREATED, orig, dest, prio, handle, is_pre, req)
 
 
 
 
 #else // !STARPU_USE_FXT
 #else // !STARPU_USE_FXT
@@ -1451,7 +1451,7 @@ do {										\
 #define _STARPU_TRACE_DATA_STATE_INVALID(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_STATE_INVALID(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_STATE_OWNER(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_STATE_OWNER(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)	do {(void)(handle); (void)(node);} while(0)
-#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre) do {(void)(handle); (void)(orig); (void)(dest); (void)(prio); (void)(is_pre);} while(0)
+#define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre, req) do {(void)(handle); (void)(orig); (void)(dest); (void)(prio); (void)(is_pre); (void)(req); } while(0)
 #define _STARPU_TRACE_PAPI_TASK_EVENT(event_id, task, value) do {(void)(event_id); (void)(task); (void)(value);} while(0)
 #define _STARPU_TRACE_PAPI_TASK_EVENT(event_id, task, value) do {(void)(event_id); (void)(task); (void)(value);} while(0)
 
 
 #endif // STARPU_USE_FXT
 #endif // STARPU_USE_FXT

+ 5 - 0
src/common/hash.c

@@ -46,6 +46,11 @@ uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc)
 	return crc;
 	return crc;
 }
 }
 
 
+uint32_t starpu_hash_crc32c_be_ptr(void *input, uint32_t inputcrc)
+{
+	return starpu_hash_crc32c_be_n(&input, sizeof(input), inputcrc);
+}
+
 uint32_t starpu_hash_crc32c_be(uint32_t input, uint32_t inputcrc)
 uint32_t starpu_hash_crc32c_be(uint32_t input, uint32_t inputcrc)
 {
 {
 	uint8_t *p = (uint8_t *)&input;
 	uint8_t *p = (uint8_t *)&input;

+ 4 - 4
src/common/uthash.h

@@ -104,12 +104,12 @@ do {
   if (!((tbl)->bloom_bv))  { uthash_fatal( "out of memory"); }                   \
   if (!((tbl)->bloom_bv))  { uthash_fatal( "out of memory"); }                   \
   memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN);                                \
   memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN);                                \
   (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE;                                       \
   (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE;                                       \
-} while (0);
+} while (0)
 
 
 #define HASH_BLOOM_FREE(tbl)                                                     \
 #define HASH_BLOOM_FREE(tbl)                                                     \
 do {                                                                             \
 do {                                                                             \
   uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN);                              \
   uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN);                              \
-} while (0);
+} while (0)
 
 
 #define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8)))
 #define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8)))
 #define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8)))
 #define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8)))
@@ -368,7 +368,7 @@ do {
   for(_fn_i=0; _fn_i < keylen; _fn_i++)                                          \
   for(_fn_i=0; _fn_i < keylen; _fn_i++)                                          \
       hashv = (hashv * 16777619) ^ _hf_key[_fn_i];                               \
       hashv = (hashv * 16777619) ^ _hf_key[_fn_i];                               \
   bkt = hashv & (num_bkts-1);                                                    \
   bkt = hashv & (num_bkts-1);                                                    \
-} while(0);
+} while(0)
  
  
 #define HASH_OAT(key,keylen,num_bkts,hashv,bkt)                                  \
 #define HASH_OAT(key,keylen,num_bkts,hashv,bkt)                                  \
 do {                                                                             \
 do {                                                                             \
@@ -507,7 +507,7 @@ do {
     hashv ^= hashv << 25;                                                        \
     hashv ^= hashv << 25;                                                        \
     hashv += hashv >> 6;                                                         \
     hashv += hashv >> 6;                                                         \
     bkt = hashv & (num_bkts-1);                                                  \
     bkt = hashv & (num_bkts-1);                                                  \
-} while(0);
+} while(0)
 
 
 #ifdef HASH_USING_NO_STRICT_ALIASING
 #ifdef HASH_USING_NO_STRICT_ALIASING
 /* The MurmurHash exploits some CPU's (e.g. x86) tolerance for unaligned reads.
 /* The MurmurHash exploits some CPU's (e.g. x86) tolerance for unaligned reads.

+ 1 - 1
src/core/dependencies/data_arbiter_concurrency.c

@@ -286,7 +286,7 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 		{
 			cpt++;
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);
 			_starpu_spin_lock(&handle->header_lock);

+ 2 - 2
src/core/dependencies/data_concurrency.c

@@ -132,7 +132,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 		{
 			cpt++;
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);
 			_starpu_spin_lock(&handle->header_lock);
@@ -266,7 +266,7 @@ static void _starpu_take_data(unsigned request_from_codelet,
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 		{
 			cpt++;
 			cpt++;
-			_starpu_datawizard_progress(0);
+			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);
 			_starpu_spin_lock(&handle->header_lock);

+ 6 - 2
src/core/dependencies/implicit_data_deps.c

@@ -225,8 +225,12 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
 		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
 		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
 		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
 
 
-		if (mode & STARPU_R)
-			STARPU_ASSERT_MSG(handle->initialized || handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
+		if (mode & STARPU_R && !handle->initialized)
+		{
+			STARPU_ASSERT_MSG(handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
+			/* The task will initialize it with init_cl */
+			handle->initialized = 1;
+		}
 
 
 		if (mode & STARPU_W || mode == STARPU_REDUX)
 		if (mode & STARPU_W || mode == STARPU_REDUX)
 		{
 		{

+ 1 - 2
src/core/jobs.c

@@ -288,8 +288,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	{
 	{
 		unsigned long jobs = STARPU_ATOMIC_ADDL(&njobs_finished, 1);
 		unsigned long jobs = STARPU_ATOMIC_ADDL(&njobs_finished, 1);
 
 
-		printf("\r%lu tasks finished...", jobs);
-		fflush(stdout);
+		fprintf(stderr,"\r%lu tasks finished (last %lu %p)...", jobs, j->job_id, j->task);
 	}
 	}
 
 
 	struct starpu_task *task = j->task;
 	struct starpu_task *task = j->task;

+ 18 - 11
src/core/perfmodel/energy_model.c

@@ -43,7 +43,7 @@
 #endif
 #endif
 #endif
 #endif
 
 
-#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  return(retval); } while (0)
+#define ERROR_RETURN(retval, function) do { PAPI_perror(function); fprintf(stderr, "Error %d %s:line %d\n", retval,__FILE__,__LINE__);  return(retval); } while (0)
 
 
 #if 0
 #if 0
 #define debug(fmt, ...) printf(fmt, ## __VA_ARGS__)
 #define debug(fmt, ...) printf(fmt, ## __VA_ARGS__)
@@ -52,6 +52,7 @@
 #endif
 #endif
 
 
 #ifdef STARPU_PAPI
 #ifdef STARPU_PAPI
+#ifdef STARPU_HAVE_HWLOC
 static const int N_EVTS = 2;
 static const int N_EVTS = 2;
 
 
 static int nsockets;
 static int nsockets;
@@ -68,7 +69,7 @@ static int add_event(int EventSet, int socket);
 
 
 /*must be initialized to PAPI_NULL before calling PAPI_create_event*/
 /*must be initialized to PAPI_NULL before calling PAPI_create_event*/
 static int EventSet = PAPI_NULL;
 static int EventSet = PAPI_NULL;
-
+#endif
 #endif
 #endif
 
 
 static double t1;
 static double t1;
@@ -80,7 +81,7 @@ static nvmlDevice_t device;
 #endif
 #endif
 #endif
 #endif
 
 
-int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
+int starpu_energy_start(int workerid STARPU_ATTRIBUTE_UNUSED, enum starpu_worker_archtype archi)
 {
 {
 	t1 = starpu_timing_now();
 	t1 = starpu_timing_now();
 
 
@@ -100,11 +101,11 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
 		nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
 		nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
 
 
 		if ((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
 		if ((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_library_init");
 
 
 		/* Creating the eventset */
 		/* Creating the eventset */
 		if ((retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
 		if ((retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_create_eventset");
 
 
 		int i;
 		int i;
 		for (i = 0 ; i < nsockets ; i ++ )
 		for (i = 0 ; i < nsockets ; i ++ )
@@ -112,19 +113,25 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
 			/* return the index of socket */
 			/* return the index of socket */
 			hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PACKAGE, i);
 			hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PACKAGE, i);
 			if ( (retval = add_event(EventSet, obj->os_index)) != PAPI_OK)
 			if ( (retval = add_event(EventSet, obj->os_index)) != PAPI_OK)
-				ERROR_RETURN(retval);
+			{
+				if (retval == PAPI_EPERM)
+					_STARPU_DISP("PAPI could not access counters due to permissions errors. Perhaps your system requires to run measurements as root?\n");
+				else if (retval == PAPI_ENOEVNT)
+					_STARPU_DISP("PAPI could not access counters. Perhaps your system requires to run measurements as root?\n");
+				ERROR_RETURN(retval, "PAPI_add_named_event");
+			}
 		}
 		}
 
 
 		/* get the number of events in the event set */
 		/* get the number of events in the event set */
 		number = 0;
 		number = 0;
 		if ( (retval = PAPI_list_events(EventSet, NULL, &number)) != PAPI_OK)
 		if ( (retval = PAPI_list_events(EventSet, NULL, &number)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_list_events");
 
 
 		debug("There are %d events in the event set\n", number);
 		debug("There are %d events in the event set\n", number);
 
 
 		/* Start counting */
 		/* Start counting */
 		if ( (retval = PAPI_start(EventSet)) != PAPI_OK)
 		if ( (retval = PAPI_start(EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_start");
 
 
 		return retval;
 		return retval;
 	}
 	}
@@ -180,7 +187,7 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
 
 		/* Stop counting and store the values into the array */
 		/* Stop counting and store the values into the array */
 		if ( (retval = PAPI_stop(EventSet, values)) != PAPI_OK)
 		if ( (retval = PAPI_stop(EventSet, values)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_stop");
 
 
 		int k,s;
 		int k,s;
 
 
@@ -199,11 +206,11 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
 
 		/*removes all events from a PAPI event set */
 		/*removes all events from a PAPI event set */
 		if ( (retval = PAPI_cleanup_eventset(EventSet)) != PAPI_OK)
 		if ( (retval = PAPI_cleanup_eventset(EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_cleanup_eventset");
 
 
 		/*deallocates the memory associated with an empty PAPI EventSet*/
 		/*deallocates the memory associated with an empty PAPI EventSet*/
 		if ( (retval = PAPI_destroy_eventset(&EventSet)) != PAPI_OK)
 		if ( (retval = PAPI_destroy_eventset(&EventSet)) != PAPI_OK)
-			ERROR_RETURN(retval);
+			ERROR_RETURN(retval, "PAPI_destroy_eventset");
 
 
 		break;
 		break;
 	}
 	}

+ 8 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -1328,7 +1328,7 @@ static void write_bus_latency_file_content(void)
 
 
 	_STARPU_DEBUG("writing latencies to %s\n", path);
 	_STARPU_DEBUG("writing latencies to %s\n", path);
 
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	if (!f)
 	if (!f)
 	{
 	{
 		perror("fopen write_bus_latency_file_content");
 		perror("fopen write_bus_latency_file_content");
@@ -1337,6 +1337,7 @@ static void write_bus_latency_file_content(void)
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
 	locked = _starpu_fwrlock(f) == 0;
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 	_starpu_fftruncate(f, 0);
 
 
 	fprintf(f, "# ");
 	fprintf(f, "# ");
@@ -1684,10 +1685,11 @@ static void write_bus_bandwidth_file_content(void)
 
 
 	_STARPU_DEBUG("writing bandwidth to %s\n", path);
 	_STARPU_DEBUG("writing bandwidth to %s\n", path);
 
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
 	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
 
 
 	locked = _starpu_fwrlock(f) == 0;
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 	_starpu_fftruncate(f, 0);
 
 
 	fprintf(f, "# ");
 	fprintf(f, "# ");
@@ -2124,9 +2126,10 @@ static void write_bus_config_file_content(void)
 
 
 	_STARPU_DEBUG("writing config to %s\n", path);
 	_STARPU_DEBUG("writing config to %s\n", path);
 
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
 	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
 	locked = _starpu_fwrlock(f) == 0;
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 	_starpu_fftruncate(f, 0);
 
 
 	fprintf(f, "# Current configuration\n");
 	fprintf(f, "# Current configuration\n");
@@ -2655,7 +2658,7 @@ static void write_bus_platform_file_content(int version)
 
 
 	_STARPU_DEBUG("writing platform to %s\n", path);
 	_STARPU_DEBUG("writing platform to %s\n", path);
 
 
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	if (!f)
 	if (!f)
 	{
 	{
 		perror("fopen write_bus_platform_file_content");
 		perror("fopen write_bus_platform_file_content");
@@ -2664,6 +2667,7 @@ static void write_bus_platform_file_content(int version)
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
 	locked = _starpu_fwrlock(f) == 0;
 	locked = _starpu_fwrlock(f) == 0;
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 	_starpu_fftruncate(f, 0);
 
 
 	fprintf(f,
 	fprintf(f,

+ 4 - 3
src/core/perfmodel/perfmodel_history.c

@@ -1177,11 +1177,12 @@ void starpu_save_history_based_model(struct starpu_perfmodel *model)
 
 
 	/* overwrite existing file, or create it */
 	/* overwrite existing file, or create it */
 	FILE *f;
 	FILE *f;
-	f = fopen(path, "w+");
+	f = fopen(path, "a+");
 	STARPU_ASSERT_MSG(f, "Could not save performance model %s\n", path);
 	STARPU_ASSERT_MSG(f, "Could not save performance model %s\n", path);
 
 
 	locked = _starpu_fwrlock(f) == 0;
 	locked = _starpu_fwrlock(f) == 0;
 	check_model(model);
 	check_model(model);
+	fseek(f, 0, SEEK_SET);
 	_starpu_fftruncate(f, 0);
 	_starpu_fftruncate(f, 0);
 	dump_model_file(f, model);
 	dump_model_file(f, model);
 	if (locked)
 	if (locked)
@@ -1610,10 +1611,10 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 	}
 	}
 
 
 	regmodel = &model->state->per_arch[comb][nimpl].regression;
 	regmodel = &model->state->per_arch[comb][nimpl].regression;
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 
 
 	if (regmodel->valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
 	if (regmodel->valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 
 
 docal:
 docal:
 	STARPU_HG_DISABLE_CHECKING(model->benchmarking);
 	STARPU_HG_DISABLE_CHECKING(model->benchmarking);
@@ -1654,8 +1655,8 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
 
 	if (regmodel->nl_valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
 	if (regmodel->nl_valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
 	{
 	{
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
+		STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
 	}
 	}
 	else
 	else
 	{
 	{

+ 1 - 20
src/core/sched_policy.c

@@ -206,7 +206,7 @@ struct starpu_sched_policy *_starpu_select_sched_policy(struct _starpu_machine_c
 	if (selected_policy)
 	if (selected_policy)
 		return selected_policy;
 		return selected_policy;
 
 
-	/* If no policy was specified, we use the eager policy by default */
+	/* If no policy was specified, we use the lws policy by default */
 	return &_starpu_sched_lws_policy;
 	return &_starpu_sched_lws_policy;
 }
 }
 
 
@@ -1153,25 +1153,6 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 	}
 	}
 }
 }
 
 
-void _starpu_wait_on_sched_event(void)
-{
-	struct _starpu_worker *worker = _starpu_get_local_worker_key();
-
-	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
-
-	_starpu_handle_all_pending_node_data_requests(worker->memory_node);
-
-	if (_starpu_machine_is_running())
-	{
-#ifndef STARPU_NON_BLOCKING_DRIVERS
-		STARPU_PTHREAD_COND_WAIT(&worker->sched_cond,
-					  &worker->sched_mutex);
-#endif
-	}
-
-	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
-}
-
 int starpu_push_local_task(int workerid, struct starpu_task *task, int back STARPU_ATTRIBUTE_UNUSED)
 int starpu_push_local_task(int workerid, struct starpu_task *task, int back STARPU_ATTRIBUTE_UNUSED)
 {
 {
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);

+ 0 - 2
src/core/sched_policy.h

@@ -63,8 +63,6 @@ struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx);
 void _starpu_sched_post_exec_hook(struct starpu_task *task);
 void _starpu_sched_post_exec_hook(struct starpu_task *task);
 int _starpu_pop_task_end(struct starpu_task *task);
 int _starpu_pop_task_end(struct starpu_task *task);
 
 
-void _starpu_wait_on_sched_event(void);
-
 struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
 struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
 						   unsigned int node) STARPU_ATTRIBUTE_MALLOC;
 						   unsigned int node) STARPU_ATTRIBUTE_MALLOC;
 
 

+ 10 - 0
src/core/workers.c

@@ -1168,6 +1168,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 
 
 	/* Do not start performance counter collection by default */
 	/* Do not start performance counter collection by default */
 	conf->start_perf_counter_collection = 0;
 	conf->start_perf_counter_collection = 0;
+
+	conf->cuda_only_fast_alloc_other_memnodes = starpu_get_env_number_default("STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES", 0);
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1531,6 +1533,14 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 		_STARPU_DISP("Warning: STARPU_ENABLE_STATS is enabled, which slows down a bit\n");
 		_STARPU_DISP("Warning: STARPU_ENABLE_STATS is enabled, which slows down a bit\n");
 	}
 	}
 
 
+#ifndef STARPU_SIMGRID
+	if (starpu_get_env_number_default("STARPU_SIMGRID", 0))
+	{
+		_STARPU_DISP("Simulation mode requested, but this libstarpu was built without simgrid support, please recompile\n");
+		return -EINVAL;
+	}
+#endif
+
 #if defined(_WIN32) && !defined(__CYGWIN__)
 #if defined(_WIN32) && !defined(__CYGWIN__)
 	WSADATA wsadata;
 	WSADATA wsadata;
 	WSAStartup(MAKEWORD(1,0), &wsadata);
 	WSAStartup(MAKEWORD(1,0), &wsadata);

+ 78 - 61
src/datawizard/coherency.c

@@ -179,7 +179,6 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 
 
 	/* the data is present now */
 	/* the data is present now */
 	unsigned requesting_node = requesting_replicate->memory_node;
 	unsigned requesting_node = requesting_replicate->memory_node;
-	requesting_replicate->requested &= ~(1UL << requesting_node);
 
 
 	if (mode & STARPU_W)
 	if (mode & STARPU_W)
 	{
 	{
@@ -406,16 +405,18 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * indicate either the source of the request, or the destination for a
  * write-only request. */
  * write-only request. */
-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, struct starpu_task *task, enum starpu_is_prefetch is_prefetch)
 {
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request *r;
 
 
-	r = replicate->request[node];
-
-	if (r)
+	for (r = replicate->request[node]; r; r = r->next_same_req)
 	{
 	{
 		_starpu_spin_checklocked(&r->handle->header_lock);
 		_starpu_spin_checklocked(&r->handle->header_lock);
 
 
+		if (task && r->task && task != r->task)
+			/* Do not collapse requests for different tasks */
+			continue;
+
 		_starpu_spin_lock(&r->lock);
 		_starpu_spin_lock(&r->lock);
 
 
                 /* perhaps we need to "upgrade" the request */
                 /* perhaps we need to "upgrade" the request */
@@ -440,9 +441,12 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 
 		if (mode & STARPU_W)
 		if (mode & STARPU_W)
 			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int)  STARPU_W);
 			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int)  STARPU_W);
+
+		/* We collapse with this request */
+		return r;
 	}
 	}
 
 
-	return r;
+	return NULL;
 }
 }
 
 
 
 
@@ -469,7 +473,9 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch,
+								  enum starpu_data_access_mode mode,
+								  struct starpu_task *task,
+								  enum starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
 {
@@ -493,8 +499,11 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		unsigned nnodes = starpu_memory_nodes_get_count();
 		unsigned nnodes = starpu_memory_nodes_get_count();
 		for (i = 0; i < nnodes; i++)
 		for (i = 0; i < nnodes; i++)
 			for (j = 0; j < nnodes; j++)
 			for (j = 0; j < nnodes; j++)
-				if (handle->per_node[i].request[j])
+			{
+				struct _starpu_data_request *r;
+				for (r = handle->per_node[i].request[j]; r; r = r->next_same_req)
 					nwait++;
 					nwait++;
+			}
 		/* If the request is not detached (i.e. the caller really wants
 		/* If the request is not detached (i.e. the caller really wants
 		 * proper ownership), no new requests will appear because a
 		 * proper ownership), no new requests will appear because a
 		 * reference will be kept on the dst replicate, which will
 		 * reference will be kept on the dst replicate, which will
@@ -531,6 +540,25 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
 
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
 			}
 			}
+
+			if (task)
+			{
+				unsigned j;
+				unsigned nnodes = starpu_memory_nodes_get_count();
+				/* Cancel any existing (prefetch) request */
+				struct _starpu_data_request *r2;
+				for (j = 0; j < nnodes; j++)
+				{
+					for (r2 = dst_replicate->request[j]; r2; r2 = r2->next_same_req)
+					{
+						if (r2->task && r2->task == task)
+						{
+							r2->canceled = 1;
+							break;
+						}
+					}
+				}
+			}
 		}
 		}
 
 
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
@@ -568,11 +596,12 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
 		if (mode & STARPU_W)
 		if (mode & STARPU_W)
 			dst_replicate->initialized = 1;
 			dst_replicate->initialized = 1;
-		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait)
+		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait
+			&& !_starpu_malloc_willpin_on_node(requesting_node))
 		{
 		{
-			/* And this is the main RAM, really no need for a
-			 * request, just allocate */
-			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
+			/* And this is the main RAM without pinning, really no need for a
+			 * request, just quickly allocate and be done */
+			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch, 0) == 0)
 			{
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);
 				_starpu_update_data_state(handle, dst_replicate, mode);
 				if (dst_replicate->mc)
 				if (dst_replicate->mc)
@@ -629,9 +658,12 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;
 		hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;
 
 
 		/* Try to reuse a request if possible */
 		/* Try to reuse a request if possible */
+#ifdef STARPU_DEVEL
+#warning We do not actually want to reuse an existing request when our request is for a task with low priority, that will get executed much later. We don t want to wire down the data in between, at worse that could hog the complete gpu memory...
+#endif
 		r = _starpu_search_existing_data_request(hop_dst_replicate,
 		r = _starpu_search_existing_data_request(hop_dst_replicate,
 				(mode & STARPU_R)?hop_src_node:hop_dst_node,
 				(mode & STARPU_R)?hop_src_node:hop_dst_node,
-							 mode, is_prefetch);
+							 mode, task, is_prefetch);
 
 
 		reused_requests[hop] = !!r;
 		reused_requests[hop] = !!r;
 
 
@@ -640,7 +672,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			/* Create a new request if there was no request to reuse */
 			/* Create a new request if there was no request to reuse */
 			r = _starpu_create_data_request(handle, hop_src_replicate,
 			r = _starpu_create_data_request(handle, hop_src_replicate,
 							hop_dst_replicate, hop_handling_node,
 							hop_dst_replicate, hop_handling_node,
-							mode, ndeps, is_prefetch, prio, 0, origin);
+							mode, ndeps, task, is_prefetch, prio, 0, origin);
 			nwait++;
 			nwait++;
 		}
 		}
 
 
@@ -686,7 +718,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		 */
 		 */
 		struct _starpu_data_request *r = _starpu_create_data_request(handle, dst_replicate,
 		struct _starpu_data_request *r = _starpu_create_data_request(handle, dst_replicate,
 							dst_replicate, requesting_node,
 							dst_replicate, requesting_node,
-							STARPU_W, nwait, is_prefetch, prio, 1, origin);
+							STARPU_W, nwait, task, is_prefetch, prio, 1, origin);
 
 
 		/* and perform the callback after termination */
 		/* and perform the callback after termination */
 		_starpu_data_request_append_callback(r, callback_func, callback_arg);
 		_starpu_data_request_append_callback(r, callback_func, callback_arg);
@@ -701,8 +733,8 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		for (i = 0; i < nnodes; i++)
 		for (i = 0; i < nnodes; i++)
 			for (j = 0; j < nnodes; j++)
 			for (j = 0; j < nnodes; j++)
 			{
 			{
-				struct _starpu_data_request *r2 = handle->per_node[i].request[j];
-				if (r2)
+				struct _starpu_data_request *r2;
+				for (r2 = handle->per_node[i].request[j]; r2; r2 = r2->next_same_req)
 				{
 				{
 					_starpu_spin_lock(&r2->lock);
 					_starpu_spin_lock(&r2->lock);
 					if (is_prefetch < r2->prefetch)
 					if (is_prefetch < r2->prefetch)
@@ -736,7 +768,8 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 }
 }
 
 
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, enum starpu_is_prefetch is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached,
+			       struct starpu_task *task, enum starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
 {
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
@@ -745,7 +778,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 	{
 		cpt++;
 		cpt++;
-		_starpu_datawizard_progress(1);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
 	}
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
@@ -790,7 +823,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
 
 	struct _starpu_data_request *r;
 	struct _starpu_data_request *r;
 	r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
 	r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
-						 is_prefetch, async, callback_func, callback_arg, prio, origin);
+						 task, is_prefetch, async, callback_func, callback_arg, prio, origin);
 
 
 	/* If no request was created, the handle was already up-to-date on the
 	/* If no request was created, the handle was already up-to-date on the
 	 * node. In this case, _starpu_create_request_to_fetch_data has already
 	 * node. In this case, _starpu_create_request_to_fetch_data has already
@@ -805,24 +838,24 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
         return ret;
         return ret;
 }
 }
 
 
-static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
 }
 }
 
 
-static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
 }
 }
 
 
-static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
 }
 }
 
 
-static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
 {
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, task, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
 }
 }
 
 
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
@@ -861,8 +894,15 @@ uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
 void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
 {
 {
 	uint32_t wt_mask;
 	uint32_t wt_mask;
+	size_t max_wt_mask = sizeof(wt_mask) * 8;
+	unsigned wt_count = starpu_memory_nodes_get_count();
+	if (max_wt_mask > STARPU_MAXNODES)
+		max_wt_mask = STARPU_MAXNODES;
+	if (wt_count > max_wt_mask)
+		wt_count = max_wt_mask;
+
 	wt_mask = default_wt_mask | handle->wt_mask;
 	wt_mask = default_wt_mask | handle->wt_mask;
-	wt_mask &= (1<<starpu_memory_nodes_get_count())-1;
+	wt_mask &= (1ULL<<max_wt_mask)-1;
 
 
 	/* Note that it is possible that there is no valid copy of the data (if
 	/* Note that it is possible that there is no valid copy of the data (if
 	 * starpu_data_invalidate was called for instance). In that case, we do
 	 * starpu_data_invalidate was called for instance). In that case, we do
@@ -871,14 +911,14 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 	unsigned memory_node = replicate->memory_node;
 	unsigned memory_node = replicate->memory_node;
 
 
 	if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
 	if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
-	if (wt_mask & ~(1<<memory_node))
+	if (wt_mask && (memory_node >= max_wt_mask || wt_mask & ~(1<<memory_node)))
 		_starpu_write_through_data(handle, memory_node, wt_mask);
 		_starpu_write_through_data(handle, memory_node, wt_mask);
 
 
 	int cpt = 0;
 	int cpt = 0;
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 	{
 		cpt++;
 		cpt++;
-		_starpu_datawizard_progress(1);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
 	}
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
@@ -897,26 +937,6 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 
-static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
-{
-	int cpt = 0;
-	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
-	{
-		cpt++;
-		_starpu_datawizard_progress(1);
-	}
-	if (cpt == STARPU_SPIN_MAXTRY)
-		_starpu_spin_lock(&handle->header_lock);
-
-	if (replicate->state == STARPU_INVALID)
-	{
-		unsigned dst_node = replicate->memory_node;
-		replicate->requested |= 1UL << dst_node;
-	}
-
-	_starpu_spin_unlock(&handle->header_lock);
-}
-
 int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio, enum starpu_is_prefetch prefetch)
 int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio, enum starpu_is_prefetch prefetch)
 {
 {
 #ifdef STARPU_OPENMP
 #ifdef STARPU_OPENMP
@@ -945,12 +965,9 @@ int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node,
 
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		if (prefetch == STARPU_PREFETCH)
 		if (prefetch == STARPU_PREFETCH)
-		{
-			task_prefetch_data_on_node(handle, node, replicate, mode, prio);
-			_starpu_set_data_requested_flag_if_needed(handle, replicate);
-		}
+			task_prefetch_data_on_node(handle, node, replicate, mode, task, prio);
 		else
 		else
-			idle_prefetch_data_on_node(handle, node, replicate, mode, prio);
+			idle_prefetch_data_on_node(handle, node, replicate, mode, task, prio);
 	}
 	}
 
 
 	if (prefetch == STARPU_PREFETCH)
 	if (prefetch == STARPU_PREFETCH)
@@ -1117,8 +1134,8 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
 
 		if (async)
 		if (async)
 		{
 		{
-			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
-					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, task, STARPU_FETCH, 1,
+					_starpu_fetch_task_input_cb, worker, task->priority, "_starpu_fetch_task_input");
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
 			if (_starpu_simgrid_fetching_input_cost())
 				starpu_sleep(0.000001);
 				starpu_sleep(0.000001);
@@ -1133,7 +1150,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 		}
 		}
 		else
 		else
 		{
 		{
-			ret = fetch_data(handle, node, local_replicate, mode, 0);
+			ret = fetch_data(handle, node, local_replicate, mode, task, task->priority);
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
 			if (_starpu_simgrid_fetching_input_cost())
 				starpu_sleep(0.000001);
 				starpu_sleep(0.000001);
@@ -1371,7 +1388,7 @@ void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
 
 
 		local_replicate = get_replicate(handle, mode, -1, node);
 		local_replicate = get_replicate(handle, mode, -1, node);
 
 
-		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
+		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, task, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
 	}
 	}
 
 
 	if (profiling && task->profiling_info)
 	if (profiling && task->profiling_info)
@@ -1421,7 +1438,7 @@ unsigned starpu_data_is_on_node(starpu_data_handle_t handle, unsigned node)
 
 
 		for (i = 0; i < nnodes; i++)
 		for (i = 0; i < nnodes; i++)
 		{
 		{
-			if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
+			if (handle->per_node[node].request[i])
 				ret = 1;
 				ret = 1;
 		}
 		}
 
 

+ 10 - 10
src/datawizard/coherency.h

@@ -72,15 +72,13 @@ struct _starpu_data_replicate
 	 * */
 	 * */
 	unsigned automatically_allocated:1;
 	unsigned automatically_allocated:1;
 
 
-	/** To help the scheduling policies to make some decision, we
-	   may keep a track of the tasks that are likely to request
-	   this data on the current node.
-	   It is the responsability of the scheduling _policy_ to set that
-	   flag when it assigns a task to a queue, policies which do not
-	   use this hint can simply ignore it.
-	 */
-	uint32_t requested;
+	/** This tracks the list of requests to provide the value */
 	struct _starpu_data_request *request[STARPU_MAXNODES];
 	struct _starpu_data_request *request[STARPU_MAXNODES];
+	/** This points to the last entry of request, to easily append to the list */
+	struct _starpu_data_request *last_request[STARPU_MAXNODES];
+
+	/* Which request is loading data here */
+	struct _starpu_data_request *load_request;
 
 
 	/** The number of prefetches that we made for this replicate for various tasks
 	/** The number of prefetches that we made for this replicate for various tasks
 	 * This is also the number of tasks that we will wait to see use the mc before
 	 * This is also the number of tasks that we will wait to see use the mc before
@@ -322,7 +320,8 @@ struct _starpu_data_state
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  */
  */
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, enum starpu_is_prefetch is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached,
+			       struct starpu_task *task, enum starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
@@ -369,7 +368,8 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
  */
  */
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch,
+								  enum starpu_data_access_mode mode,
+								  struct starpu_task *task, enum starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 
 

+ 3 - 3
src/datawizard/copy_driver.c

@@ -200,7 +200,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 									struct _starpu_data_replicate *dst_replicate,
 									struct _starpu_data_replicate *dst_replicate,
 									unsigned donotread,
 									unsigned donotread,
 									struct _starpu_data_request *req,
 									struct _starpu_data_request *req,
-									unsigned may_alloc,
+									enum _starpu_may_alloc may_alloc,
 									enum starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
 									enum starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
 {
 {
 	if (!donotread)
 	if (!donotread)
@@ -215,11 +215,11 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 	/* first make sure the destination has an allocated buffer */
 	/* first make sure the destination has an allocated buffer */
 	if (!dst_replicate->allocated)
 	if (!dst_replicate->allocated)
 	{
 	{
-		if (!may_alloc || _starpu_is_reclaiming(dst_node))
+		if (may_alloc==STARPU_DATAWIZARD_DO_NOT_ALLOC || _starpu_is_reclaiming(dst_node))
 			/* We're not supposed to allocate there at the moment */
 			/* We're not supposed to allocate there at the moment */
 			return -ENOMEM;
 			return -ENOMEM;
 
 
-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : STARPU_FETCH);
+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, prefetch, may_alloc==STARPU_DATAWIZARD_ONLY_FAST_ALLOC);
 		if (ret_alloc)
 		if (ret_alloc)
 			return -ENOMEM;
 			return -ENOMEM;
 	}
 	}

+ 8 - 1
src/datawizard/copy_driver.h

@@ -47,6 +47,13 @@ extern "C"
 struct _starpu_data_request;
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 struct _starpu_data_replicate;
 
 
+enum _starpu_may_alloc
+{
+	STARPU_DATAWIZARD_DO_NOT_ALLOC,
+	STARPU_DATAWIZARD_DO_ALLOC,
+	STARPU_DATAWIZARD_ONLY_FAST_ALLOC
+};
+
 #ifdef STARPU_USE_MIC
 #ifdef STARPU_USE_MIC
 /** MIC needs memory_node to know which MIC is concerned.
 /** MIC needs memory_node to know which MIC is concerned.
  * mark is used to wait asynchronous request.
  * mark is used to wait asynchronous request.
@@ -131,7 +138,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 				    struct _starpu_data_replicate *dst_replicate,
 				    struct _starpu_data_replicate *dst_replicate,
 				    unsigned donotread,
 				    unsigned donotread,
 				    struct _starpu_data_request *req,
 				    struct _starpu_data_request *req,
-				    unsigned may_alloc,
+				    enum _starpu_may_alloc may_alloc,
 				    enum starpu_is_prefetch prefetch);
 				    enum starpu_is_prefetch prefetch);
 
 
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);

+ 306 - 179
src/datawizard/data_request.c

@@ -25,57 +25,67 @@
 #include <core/simgrid.h>
 #include <core/simgrid.h>
 
 
 /* requests that have not been treated at all */
 /* requests that have not been treated at all */
-#ifdef STARPU_DEVEL
-#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectionnal
-#endif
-static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
-static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES]; /* Contains both task_prefetch and prefetch */
-static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
-static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES][STARPU_MAXNODES][2]; /* Contains both task_prefetch and prefetch */
+static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES][STARPU_MAXNODES][2];
 
 
 /* requests that are not terminated (eg. async transfers) */
 /* requests that are not terminated (eg. async transfers) */
-static struct _starpu_data_request_prio_list data_requests_pending[STARPU_MAXNODES];
-static unsigned data_requests_npending[STARPU_MAXNODES];
-static starpu_pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list data_requests_pending[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static unsigned data_requests_npending[STARPU_MAXNODES][STARPU_MAXNODES][2];
+static starpu_pthread_mutex_t data_requests_pending_list_mutex[STARPU_MAXNODES][STARPU_MAXNODES][2];
 
 
 void _starpu_init_data_request_lists(void)
 void _starpu_init_data_request_lists(void)
 {
 {
-	unsigned i;
+	unsigned i, j;
+	enum _starpu_data_request_inout k;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
 	{
-		_starpu_data_request_prio_list_init(&data_requests[i]);
-		_starpu_data_request_prio_list_init(&prefetch_requests[i]);
-		_starpu_data_request_prio_list_init(&idle_requests[i]);
+		for (j = 0; j < STARPU_MAXNODES; j++)
+		{
+			for (k = _STARPU_DATA_REQUEST_IN; k <= _STARPU_DATA_REQUEST_OUT; k++)
+			{
+				_starpu_data_request_prio_list_init(&data_requests[i][j][k]);
+				_starpu_data_request_prio_list_init(&prefetch_requests[i][j][k]);
+				_starpu_data_request_prio_list_init(&idle_requests[i][j][k]);
 
 
 #ifndef STARPU_DEBUG
 #ifndef STARPU_DEBUG
-		/* Tell helgrind that we are fine with checking for list_empty
-		 * in _starpu_handle_node_data_requests, we will call it
-		 * periodically anyway */
-		STARPU_HG_DISABLE_CHECKING(data_requests[i].tree.root);
-		STARPU_HG_DISABLE_CHECKING(prefetch_requests[i].tree.root);
-		STARPU_HG_DISABLE_CHECKING(idle_requests[i].tree.root);
+				/* Tell helgrind that we are fine with checking for list_empty
+				 * in _starpu_handle_node_data_requests, we will call it
+				 * periodically anyway */
+				STARPU_HG_DISABLE_CHECKING(data_requests[i][j][k].tree.root);
+				STARPU_HG_DISABLE_CHECKING(prefetch_requests[i][j][k].tree.root);
+				STARPU_HG_DISABLE_CHECKING(idle_requests[i][j][k].tree.root);
 #endif
 #endif
+				_starpu_data_request_prio_list_init(&data_requests_pending[i][j][k]);
+				data_requests_npending[i][j][k] = 0;
 
 
-		STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i], NULL);
-
-		_starpu_data_request_prio_list_init(&data_requests_pending[i]);
-		data_requests_npending[i] = 0;
-		STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i], NULL);
+				STARPU_PTHREAD_MUTEX_INIT(&data_requests_list_mutex[i][j][k], NULL);
+				STARPU_PTHREAD_MUTEX_INIT(&data_requests_pending_list_mutex[i][j][k], NULL);
+			}
+		}
 	}
 	}
 	STARPU_HG_DISABLE_CHECKING(data_requests_npending);
 	STARPU_HG_DISABLE_CHECKING(data_requests_npending);
 }
 }
 
 
 void _starpu_deinit_data_request_lists(void)
 void _starpu_deinit_data_request_lists(void)
 {
 {
-	unsigned i;
+	unsigned i, j;
+	enum _starpu_data_request_inout k;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
 	{
-		_starpu_data_request_prio_list_deinit(&data_requests[i]);
-		_starpu_data_request_prio_list_deinit(&prefetch_requests[i]);
-		_starpu_data_request_prio_list_deinit(&idle_requests[i]);
-		STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i]);
-		_starpu_data_request_prio_list_deinit(&data_requests_pending[i]);
-		STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i]);
+		for (j = 0; j < STARPU_MAXNODES; j++)
+		{
+			for (k = _STARPU_DATA_REQUEST_IN; k <= _STARPU_DATA_REQUEST_OUT; k++)
+			{
+				_starpu_data_request_prio_list_deinit(&data_requests[i][j][k]);
+				_starpu_data_request_prio_list_deinit(&prefetch_requests[i][j][k]);
+				_starpu_data_request_prio_list_deinit(&idle_requests[i][j][k]);
+				_starpu_data_request_prio_list_deinit(&data_requests_pending[i][j][k]);
+				STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_pending_list_mutex[i][j][k]);
+				STARPU_PTHREAD_MUTEX_DESTROY(&data_requests_list_mutex[i][j][k]);
+			}
+		}
 	}
 	}
 }
 }
 
 
@@ -92,23 +102,39 @@ static void _starpu_data_request_unlink(struct _starpu_data_request *r)
 		STARPU_ASSERT(r->mode == STARPU_W);
 		STARPU_ASSERT(r->mode == STARPU_W);
 		r->handle->write_invalidation_req = NULL;
 		r->handle->write_invalidation_req = NULL;
 	}
 	}
-	else if (r->mode & STARPU_R)
-	{
-		/* If this is a read request, we store the pending requests
-		 * between src and dst. */
-		unsigned node = r->src_replicate->memory_node;
-		STARPU_ASSERT(r->dst_replicate->request[node] == r);
-		r->dst_replicate->request[node] = NULL;
-	}
 	else
 	else
 	{
 	{
-		/* If this is a write only request, then there is no source and
-		 * we use the destination node to cache the request. */
-		unsigned node = r->dst_replicate->memory_node;
-		STARPU_ASSERT(r->dst_replicate->request[node] == r);
-		r->dst_replicate->request[node] = NULL;
-	}
+		unsigned node;
+		struct _starpu_data_request **prevp, *prev;
+
+		if (r->mode & STARPU_R)
+			/* If this is a read request, we store the pending requests
+			 * between src and dst. */
+			node = r->src_replicate->memory_node;
+		else
+			/* If this is a write only request, then there is no source and
+			 * we use the destination node to cache the request. */
+			node = r->dst_replicate->memory_node;
+
+		/* Look for ourselves in the list; we should not be very far. */
+		for (prevp = &r->dst_replicate->request[node], prev = NULL;
+		     *prevp && *prevp != r;
+		     prev = *prevp, prevp = &prev->next_same_req)
+			;
 
 
+		STARPU_ASSERT(*prevp == r);
+		*prevp = r->next_same_req;
+
+		if (!r->next_same_req)
+		{
+			/* I was last */
+			STARPU_ASSERT(r->dst_replicate->last_request[node] == r);
+			if (prev)
+				r->dst_replicate->last_request[node] = prev;
+			else
+				r->dst_replicate->last_request[node] = NULL;
+		}
+	}
 }
 }
 
 
 static void _starpu_data_request_destroy(struct _starpu_data_request *r)
 static void _starpu_data_request_destroy(struct _starpu_data_request *r)
@@ -124,6 +150,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
 							 unsigned ndeps,
+							 struct starpu_task *task,
 							 enum starpu_is_prefetch is_prefetch,
 							 enum starpu_is_prefetch is_prefetch,
 							 int prio,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 unsigned is_write_invalidation,
@@ -135,7 +162,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
 
 	_starpu_spin_init(&r->lock);
 	_starpu_spin_init(&r->lock);
 
 
-	_STARPU_TRACE_DATA_REQUEST_CREATED(handle, src_replicate?src_replicate->memory_node:-1, dst_replicate?dst_replicate->memory_node:-1, prio, is_prefetch);
+	_STARPU_TRACE_DATA_REQUEST_CREATED(handle, src_replicate?src_replicate->memory_node:-1, dst_replicate?dst_replicate->memory_node:-1, prio, is_prefetch, r);
 
 
 	r->origin = origin;
 	r->origin = origin;
 	r->handle = handle;
 	r->handle = handle;
@@ -153,22 +180,48 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	if (handling_node == -1)
 	if (handling_node == -1)
 		handling_node = STARPU_MAIN_RAM;
 		handling_node = STARPU_MAIN_RAM;
 	r->handling_node = handling_node;
 	r->handling_node = handling_node;
+	if (is_write_invalidation)
+	{
+		r->peer_node = handling_node;
+		r->inout = _STARPU_DATA_REQUEST_IN;
+	}
+	else if (dst_replicate->memory_node == handling_node)
+	{
+		if (src_replicate)
+			r->peer_node = src_replicate->memory_node;
+		else
+			r->peer_node = handling_node;
+		r->inout = _STARPU_DATA_REQUEST_IN;
+	}
+	else
+	{
+		r->peer_node = dst_replicate->memory_node;
+		r->inout = _STARPU_DATA_REQUEST_OUT;
+	}
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	r->completed = 0;
 	r->completed = 0;
+	r->added_ref = 0;
+	r->canceled = 0;
 	r->prefetch = is_prefetch;
 	r->prefetch = is_prefetch;
+	r->task = task;
 	r->nb_tasks_prefetch = 0;
 	r->nb_tasks_prefetch = 0;
 	r->prio = prio;
 	r->prio = prio;
 	r->retval = -1;
 	r->retval = -1;
 	r->ndeps = ndeps;
 	r->ndeps = ndeps;
+	r->next_same_req = NULL;
 	r->next_req_count = 0;
 	r->next_req_count = 0;
 	r->callbacks = NULL;
 	r->callbacks = NULL;
 	r->com_id = 0;
 	r->com_id = 0;
 
 
 	_starpu_spin_lock(&r->lock);
 	_starpu_spin_lock(&r->lock);
 
 
-	/* Take a reference on the target for the request to be able to write it */
-	if (dst_replicate)
+	/* For a fetch, take a reference as soon as now on the target, to avoid
+	 * replicate eviction */
+	if (is_prefetch == STARPU_FETCH && dst_replicate)
+	{
+		r->added_ref = 1;
 		dst_replicate->refcnt++;
 		dst_replicate->refcnt++;
+	}
 	handle->busy_count++;
 	handle->busy_count++;
 
 
 	if (is_write_invalidation)
 	if (is_write_invalidation)
@@ -176,20 +229,28 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 		STARPU_ASSERT(!handle->write_invalidation_req);
 		STARPU_ASSERT(!handle->write_invalidation_req);
 		handle->write_invalidation_req = r;
 		handle->write_invalidation_req = r;
 	}
 	}
-	else if (mode & STARPU_R)
-	{
-		unsigned src_node = src_replicate->memory_node;
-		STARPU_ASSERT(!dst_replicate->request[src_node]);
-		dst_replicate->request[src_node] = r;
-		/* Take a reference on the source for the request to be able to read it */
-		src_replicate->refcnt++;
-		handle->busy_count++;
-	}
 	else
 	else
 	{
 	{
-		unsigned dst_node = dst_replicate->memory_node;
-		STARPU_ASSERT(!dst_replicate->request[dst_node]);
-		dst_replicate->request[dst_node] = r;
+		unsigned node;
+
+		if (mode & STARPU_R)
+			node = src_replicate->memory_node;
+		else
+			node = dst_replicate->memory_node;
+
+		if (!dst_replicate->request[node])
+			dst_replicate->request[node] = r;
+		else
+			dst_replicate->last_request[node]->next_same_req = r;
+		dst_replicate->last_request[node] = r;
+
+		if (mode & STARPU_R)
+		{
+			/* Take a reference on the source for the request to be
+			 * able to read it */
+			src_replicate->refcnt++;
+			handle->busy_count++;
+		}
 	}
 	}
 
 
 	r->refcnt = 1;
 	r->refcnt = 1;
@@ -199,7 +260,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	return r;
 	return r;
 }
 }
 
 
-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc)
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
 {
 {
 	int retval;
 	int retval;
 	int do_delete = 0;
 	int do_delete = 0;
@@ -310,14 +371,14 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 	}
 	}
 
 
 	/* insert the request in the proper list */
 	/* insert the request in the proper list */
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][r->peer_node][r->inout]);
 	if (r->prefetch >= STARPU_IDLEFETCH)
 	if (r->prefetch >= STARPU_IDLEFETCH)
-		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
+		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node][r->peer_node][r->inout], r);
 	else if (r->prefetch > STARPU_FETCH)
 	else if (r->prefetch > STARPU_FETCH)
-		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
+		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node][r->peer_node][r->inout], r);
 	else
 	else
-		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node]);
+		_starpu_data_request_prio_list_push_back(&data_requests[handling_node][r->peer_node][r->inout], r);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][r->peer_node][r->inout]);
 
 
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	_starpu_wake_all_blocked_workers_on_node(handling_node);
 	_starpu_wake_all_blocked_workers_on_node(handling_node);
@@ -352,7 +413,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
 
 
 
-	if (dst_replicate)
+	if (r->canceled < 2 && dst_replicate)
 	{
 	{
 #ifdef STARPU_MEMORY_STATS
 #ifdef STARPU_MEMORY_STATS
 		enum _starpu_cache_state old_src_replicate_state = src_replicate->state;
 		enum _starpu_cache_state old_src_replicate_state = src_replicate->state;
@@ -360,6 +421,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
 
 		_starpu_spin_checklocked(&handle->header_lock);
 		_starpu_spin_checklocked(&handle->header_lock);
 		_starpu_update_data_state(handle, r->dst_replicate, mode);
 		_starpu_update_data_state(handle, r->dst_replicate, mode);
+		dst_replicate->load_request = NULL;
 
 
 #ifdef STARPU_MEMORY_STATS
 #ifdef STARPU_MEMORY_STATS
 		if (src_replicate->state == STARPU_INVALID)
 		if (src_replicate->state == STARPU_INVALID)
@@ -382,7 +444,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 #endif
 #endif
 	}
 	}
 
 
-	if (r->com_id > 0)
+	if (r->canceled < 2 && r->com_id > 0)
 	{
 	{
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 		unsigned src_node = src_replicate->memory_node;
 		unsigned src_node = src_replicate->memory_node;
@@ -414,12 +476,15 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	/* Remove a reference on the destination replicate for the request */
 	/* Remove a reference on the destination replicate for the request */
 	if (dst_replicate)
 	if (dst_replicate)
 	{
 	{
-		if (dst_replicate->mc)
+		if (r->canceled < 2 && dst_replicate->mc)
 			/* Make sure it stays there for the task.  */
 			/* Make sure it stays there for the task.  */
 			dst_replicate->nb_tasks_prefetch += r->nb_tasks_prefetch;
 			dst_replicate->nb_tasks_prefetch += r->nb_tasks_prefetch;
 
 
-		STARPU_ASSERT(dst_replicate->refcnt > 0);
-		dst_replicate->refcnt--;
+		if (r->added_ref)
+		{
+			STARPU_ASSERT(dst_replicate->refcnt > 0);
+			dst_replicate->refcnt--;
+		}
 	}
 	}
 	STARPU_ASSERT(handle->busy_count > 0);
 	STARPU_ASSERT(handle->busy_count > 0);
 	handle->busy_count--;
 	handle->busy_count--;
@@ -467,8 +532,16 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	}
 	}
 }
 }
 
 
+void _starpu_data_request_complete_wait(void *arg)
+{
+	struct _starpu_data_request *r = arg;
+	_starpu_spin_lock(&r->handle->header_lock);
+	_starpu_spin_lock(&r->lock);
+	starpu_handle_data_request_completion(r);
+}
+
 /* TODO : accounting to see how much time was spent working for other people ... */
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, enum starpu_is_prefetch prefetch)
+static int starpu_handle_data_request(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc)
 {
 {
 	starpu_data_handle_t handle = r->handle;
 	starpu_data_handle_t handle = r->handle;
 
 
@@ -491,12 +564,50 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	struct _starpu_data_replicate *src_replicate = r->src_replicate;
 	struct _starpu_data_replicate *src_replicate = r->src_replicate;
 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 	struct _starpu_data_replicate *dst_replicate = r->dst_replicate;
 
 
+	if (r->canceled)
+	{
+		/* Ok, canceled before starting copies etc. */
+		r->canceled = 2;
+		/* Nothing left to do */
+		starpu_handle_data_request_completion(r);
+		return 0;
+	}
+
+	if (dst_replicate)
+	{
+		struct _starpu_data_request *r2 = dst_replicate->load_request;
+		if (r2 && r2 != r)
+		{
+			/* Oh, some other transfer is already loading the value. Just wait for it */
+			r->canceled = 2;
+			_starpu_spin_unlock(&r->lock);
+			_starpu_spin_lock(&r2->lock);
+			_starpu_data_request_append_callback(r2, _starpu_data_request_complete_wait, r);
+			_starpu_spin_unlock(&r2->lock);
+			_starpu_spin_unlock(&handle->header_lock);
+			return 0;
+		}
+
+		/* We are loading this replicate.
+		 * Note: we might fail to allocate memory, but we will keep on and others will wait for us. */
+		dst_replicate->load_request = r;
+	}
+
 	enum starpu_data_access_mode r_mode = r->mode;
 	enum starpu_data_access_mode r_mode = r->mode;
 
 
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->allocated);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->allocated);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->refcnt);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->refcnt);
 
 
+	/* For prefetches, we take a reference on the destination only now that
+	 * we will really try to fetch the data (instead of in
+	 * _starpu_create_data_request) */
+	if (dst_replicate && r->prefetch > STARPU_FETCH)
+	{
+		r->added_ref = 1;	/* Note: we might get upgraded while trying to allocate */
+		dst_replicate->refcnt++;
+	}
+
 	_starpu_spin_unlock(&r->lock);
 	_starpu_spin_unlock(&r->lock);
 
 
 	/* FIXME: the request may get upgraded from here to freeing it... */
 	/* FIXME: the request may get upgraded from here to freeing it... */
@@ -507,7 +618,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
 
 	if (dst_replicate && dst_replicate->state == STARPU_INVALID)
 	if (dst_replicate && dst_replicate->state == STARPU_INVALID)
 		r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
 		r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
-						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc, prefetch);
+						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc, r->prefetch);
 	else
 	else
 		/* Already valid actually, no need to transfer anything */
 		/* Already valid actually, no need to transfer anything */
 		r->retval = 0;
 		r->retval = 0;
@@ -516,6 +627,15 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	{
 	{
 		/* If there was not enough memory, we will try to redo the
 		/* If there was not enough memory, we will try to redo the
 		 * request later. */
 		 * request later. */
+
+		if (r->prefetch > STARPU_FETCH)
+		{
+			STARPU_ASSERT(r->added_ref);
+			/* Drop ref until next try */
+			r->added_ref = 0;
+			dst_replicate->refcnt--;
+		}
+
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
@@ -528,10 +648,10 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 		 * requests in the meantime. */
 		 * requests in the meantime. */
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 
 
-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
-		_starpu_data_request_prio_list_push_back(&data_requests_pending[r->handling_node], r);
-		data_requests_npending[r->handling_node]++;
-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node]);
+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node][r->peer_node][r->inout]);
+		_starpu_data_request_prio_list_push_back(&data_requests_pending[r->handling_node][r->peer_node][r->inout], r);
+		data_requests_npending[r->handling_node][r->peer_node][r->inout]++;
+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[r->handling_node][r->peer_node][r->inout]);
 
 
 		return -EAGAIN;
 		return -EAGAIN;
 	}
 	}
@@ -543,10 +663,9 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 	return 0;
 }
 }
 
 
-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list reqlist[STARPU_MAXNODES][STARPU_MAXNODES][2], unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned n, unsigned *pushed, enum starpu_is_prefetch prefetch)
 {
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request *r;
-	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
 	unsigned i;
 	unsigned i;
 	int ret = 0;
 	int ret = 0;
 
 
@@ -556,48 +675,55 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 	/* This is racy, but not posing problems actually, since we know we
 	/* This is racy, but not posing problems actually, since we know we
 	 * will come back here to probe again regularly anyway.
 	 * will come back here to probe again regularly anyway.
 	 * Thus, do not expose this optimization to helgrind */
 	 * Thus, do not expose this optimization to helgrind */
-	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&reqlist[src_node]))
+	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]))
 		return 0;
 		return 0;
 #endif
 #endif
 
 
-	/* TODO optimize */
+	/* We create a new list to pickup some requests from the main list, and
+	 * we handle the request(s) one by one from it, without concurrency issues.
+	 */
+	struct _starpu_data_request_list local_list, remain_list;
+	_starpu_data_request_list_init(&local_list);
 
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 	/* take all the entries from the request list */
 	/* take all the entries from the request list */
-	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[src_node]))
+	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]))
 	{
 	{
 		/* List is busy, do not bother with it */
 		/* List is busy, do not bother with it */
 		return -EBUSY;
 		return -EBUSY;
 	}
 	}
 #else
 #else
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
 #endif
 #endif
 
 
-	if (_starpu_data_request_prio_list_empty(&reqlist[src_node]))
+	for (i = data_requests_npending[handling_node][peer_node][inout];
+		i < n && ! _starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]);
+		i++)
 	{
 	{
-		/* there is no request */
-                STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
-		return 0;
+		r = _starpu_data_request_prio_list_pop_front_highest(&reqlist[handling_node][peer_node][inout]);
+		_starpu_data_request_list_push_back(&local_list, r);
 	}
 	}
 
 
-	/* There is an entry: we create a new empty list to replace the list of
-	 * requests, and we handle the request(s) one by one in the former
-	 * list, without concurrency issues.*/
-	struct _starpu_data_request_prio_list local_list = reqlist[src_node];
-	_starpu_data_request_prio_list_init(&reqlist[src_node]);
+	if (!_starpu_data_request_prio_list_empty(&reqlist[handling_node][peer_node][inout]))
+		/* We have left some requests */
+		ret = -EBUSY;
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
 
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+	if (_starpu_data_request_list_empty(&local_list))
+		/* there is no request */
+		return 0;
 
 
-	for (i = 0; i <= prefetch; i++)
-		_starpu_data_request_prio_list_init(&new_data_requests[i]);
+	/* This will contain the remaining requests */
+	_starpu_data_request_list_init(&remain_list);
 
 
 	double start = starpu_timing_now();
 	double start = starpu_timing_now();
 	/* for all entries of the list */
 	/* for all entries of the list */
-	while (!_starpu_data_request_prio_list_empty(&local_list))
+	while (!_starpu_data_request_list_empty(&local_list))
 	{
 	{
                 int res;
                 int res;
 
 
-		if (data_requests_npending[src_node] >= n)
+		if (data_requests_npending[handling_node][peer_node][inout] >= n)
 		{
 		{
 			/* Too many requests at the same time, skip pushing
 			/* Too many requests at the same time, skip pushing
 			 * more for now */
 			 * more for now */
@@ -605,21 +731,22 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			break;
 			break;
 		}
 		}
 
 
-		r = _starpu_data_request_prio_list_pop_front_highest(&local_list);
+		r = _starpu_data_request_list_pop_front(&local_list);
 
 
-		res = starpu_handle_data_request(r, may_alloc, prefetch);
+		res = starpu_handle_data_request(r, may_alloc);
 		if (res != 0 && res != -EAGAIN)
 		if (res != 0 && res != -EAGAIN)
 		{
 		{
 			/* handle is busy, or not enough memory, postpone for now */
 			/* handle is busy, or not enough memory, postpone for now */
 			ret = res;
 			ret = res;
 			/* Prefetch requests might have gotten promoted while in tmp list */
 			/* Prefetch requests might have gotten promoted while in tmp list */
-			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
+			_starpu_data_request_list_push_back(&remain_list, r);
 			if (prefetch > STARPU_FETCH)
 			if (prefetch > STARPU_FETCH)
 				/* Prefetching more there would make the situation even worse */
 				/* Prefetching more there would make the situation even worse */
 				break;
 				break;
 		}
 		}
+		else
+			(*pushed)++;
 
 
-		(*pushed)++;
 		if (starpu_timing_now() - start >= MAX_PUSH_TIME)
 		if (starpu_timing_now() - start >= MAX_PUSH_TIME)
 		{
 		{
 			/* We have spent a lot of time doing requests, skip pushing more for now */
 			/* We have spent a lot of time doing requests, skip pushing more for now */
@@ -628,43 +755,23 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 		}
 		}
 	}
 	}
 
 
-	/* Push back requests we didn't handle on the proper list */
-	while (!_starpu_data_request_prio_list_empty(&local_list))
-	{
-		r = _starpu_data_request_prio_list_pop_front_highest(&local_list);
-		/* Prefetch requests might have gotten promoted while in tmp list */
-		_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
-	}
-	_starpu_data_request_prio_list_deinit(&local_list);
-
-	for (i = 0; i <= prefetch; i++)
-		if (!_starpu_data_request_prio_list_empty(&new_data_requests[i]))
-			break;
+	/* Gather remainder */
+	_starpu_data_request_list_push_list_back(&remain_list, &local_list);
 
 
-	if (i <= prefetch)
+	if (!_starpu_data_request_list_empty(&remain_list))
 	{
 	{
-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
-		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_FETCH])))
-		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_FETCH], &data_requests[src_node]);
-			data_requests[src_node] = new_data_requests[STARPU_FETCH];
-		}
-		if (prefetch >= STARPU_TASK_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_TASK_PREFETCH])))
+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
+		while (!_starpu_data_request_list_empty(&remain_list))
 		{
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_TASK_PREFETCH], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[STARPU_TASK_PREFETCH];
-		}
-		if (prefetch >= STARPU_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_PREFETCH])))
-		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_PREFETCH], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[STARPU_PREFETCH];
-		}
-		if (prefetch >= STARPU_IDLEFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_IDLEFETCH])))
-		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_IDLEFETCH], &idle_requests[src_node]);
-			idle_requests[src_node] = new_data_requests[STARPU_IDLEFETCH];
+			r = _starpu_data_request_list_pop_back(&remain_list);
+			if (r->prefetch >= STARPU_IDLEFETCH)
+				_starpu_data_request_prio_list_push_front(&idle_requests[handling_node][r->peer_node][r->inout], r);
+			else if (r->prefetch > STARPU_FETCH)
+				_starpu_data_request_prio_list_push_front(&prefetch_requests[handling_node][r->peer_node][r->inout], r);
+			else
+				_starpu_data_request_prio_list_push_front(&data_requests[handling_node][r->peer_node][r->inout], r);
 		}
 		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[handling_node][peer_node][inout]);
 
 
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 		if (*pushed)
 		if (*pushed)
@@ -676,32 +783,32 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			 * for eviction to happen.
 			 * for eviction to happen.
 			 */
 			 */
 			starpu_sleep(0.000001);
 			starpu_sleep(0.000001);
-			_starpu_wake_all_blocked_workers_on_node(src_node);
+			_starpu_wake_all_blocked_workers_on_node(handling_node);
 		}
 		}
 #elif !defined(STARPU_NON_BLOCKING_DRIVERS)
 #elif !defined(STARPU_NON_BLOCKING_DRIVERS)
-		_starpu_wake_all_blocked_workers_on_node(src_node);
+		_starpu_wake_all_blocked_workers_on_node(handling_node);
 #endif
 #endif
 	}
 	}
 
 
 	return ret;
 	return ret;
 }
 }
 
 
-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
 {
-	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
+	return __starpu_handle_node_data_requests(data_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
 }
 }
 
 
-int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
 {
-	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
+	return __starpu_handle_node_data_requests(prefetch_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
 }
 }
 
 
-int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed)
 {
 {
-	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
+	return __starpu_handle_node_data_requests(idle_requests, handling_node, peer_node, inout, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
 }
 }
 
 
-static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
+static int _handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, unsigned force)
 {
 {
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
 //
 //
@@ -712,14 +819,14 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 	/* Here helgrind would show that this is an unprotected access.
 	/* Here helgrind would show that this is an unprotected access.
 	 * We however don't care about missing an entry, we will get called
 	 * We however don't care about missing an entry, we will get called
 	 * again sooner or later. */
 	 * again sooner or later. */
-	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&data_requests_pending[src_node]))
+	if (!STARPU_RUNNING_ON_VALGRIND && _starpu_data_request_prio_list_empty(&data_requests_pending[handling_node][peer_node][inout]))
 		return 0;
 		return 0;
 #endif
 #endif
 
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 	if (!force)
 	if (!force)
 	{
 	{
-		if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[src_node]))
+		if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]))
 		{
 		{
 			/* List is busy, do not bother with it */
 			/* List is busy, do not bother with it */
 			return 0;
 			return 0;
@@ -728,19 +835,19 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 	else
 	else
 #endif
 #endif
 		/* We really want to handle requests */
 		/* We really want to handle requests */
-		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 
 
-	if (_starpu_data_request_prio_list_empty(&data_requests_pending[src_node]))
+	if (_starpu_data_request_prio_list_empty(&data_requests_pending[handling_node][peer_node][inout]))
 	{
 	{
 		/* there is no request */
 		/* there is no request */
-		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 		return 0;
 		return 0;
 	}
 	}
 	/* for all entries of the list */
 	/* for all entries of the list */
-	struct _starpu_data_request_prio_list local_list = data_requests_pending[src_node];
-	_starpu_data_request_prio_list_init(&data_requests_pending[src_node]);
+	struct _starpu_data_request_prio_list local_list = data_requests_pending[handling_node][peer_node][inout];
+	_starpu_data_request_prio_list_init(&data_requests_pending[handling_node][peer_node][inout]);
 
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 
 
 	_starpu_data_request_prio_list_init(&new_data_requests_pending);
 	_starpu_data_request_prio_list_init(&new_data_requests_pending);
 	taken = 0;
 	taken = 0;
@@ -803,55 +910,75 @@ static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 		}
 		}
 	}
 	}
 	_starpu_data_request_prio_list_deinit(&local_list);
 	_starpu_data_request_prio_list_deinit(&local_list);
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
-	data_requests_npending[src_node] -= taken - kept;
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
+	data_requests_npending[handling_node][peer_node][inout] -= taken - kept;
 	if (kept)
 	if (kept)
-		_starpu_data_request_prio_list_push_prio_list_back(&data_requests_pending[src_node], &new_data_requests_pending);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+		_starpu_data_request_prio_list_push_prio_list_back(&data_requests_pending[handling_node][peer_node][inout], &new_data_requests_pending);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[handling_node][peer_node][inout]);
 
 
 	return taken - kept;
 	return taken - kept;
 }
 }
 
 
-int _starpu_handle_pending_node_data_requests(unsigned src_node)
+int _starpu_handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
 {
-	return _handle_pending_node_data_requests(src_node, 0);
+	return _handle_pending_node_data_requests(handling_node, peer_node, inout, 0);
 }
 }
 
 
-int _starpu_handle_all_pending_node_data_requests(unsigned src_node)
+int _starpu_handle_all_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
 {
-	return _handle_pending_node_data_requests(src_node, 1);
+	return _handle_pending_node_data_requests(handling_node, peer_node, inout, 1);
 }
 }
 
 
 /* Note: the returned value will be outdated since the locks are not taken at
 /* Note: the returned value will be outdated since the locks are not taken at
  * entry/exit */
  * entry/exit */
-int _starpu_check_that_no_data_request_exists(unsigned node)
+static int __starpu_check_that_no_data_request_exists(unsigned node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
 {
 	int no_request;
 	int no_request;
 	int no_pending;
 	int no_pending;
 
 
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[node]);
-	no_request = _starpu_data_request_prio_list_empty(&data_requests[node])
-	          && _starpu_data_request_prio_list_empty(&prefetch_requests[node])
-		  && _starpu_data_request_prio_list_empty(&idle_requests[node]);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[node]);
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[node]);
-	no_pending = !data_requests_npending[node];
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[node][peer_node][inout]);
+	no_request = _starpu_data_request_prio_list_empty(&data_requests[node][peer_node][inout])
+	          && _starpu_data_request_prio_list_empty(&prefetch_requests[node][peer_node][inout])
+		  && _starpu_data_request_prio_list_empty(&idle_requests[node][peer_node][inout]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[node][peer_node][inout]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[node][peer_node][inout]);
+	no_pending = !data_requests_npending[node][peer_node][inout];
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[node][peer_node][inout]);
 
 
 	return no_request && no_pending;
 	return no_request && no_pending;
 }
 }
 
 
+int _starpu_check_that_no_data_request_exists(unsigned node)
+{
+	unsigned peer_node, nnodes = starpu_memory_nodes_get_count();
+
+	for (peer_node = 0; peer_node < nnodes; peer_node++)
+		if (!__starpu_check_that_no_data_request_exists(node, peer_node, _STARPU_DATA_REQUEST_IN)
+		 || !__starpu_check_that_no_data_request_exists(node, peer_node, _STARPU_DATA_REQUEST_OUT))
+		 return 0;
+	 return 1;
+}
+
 /* Note: the returned value will be outdated since the locks are not taken at
 /* Note: the returned value will be outdated since the locks are not taken at
  * entry/exit */
  * entry/exit */
-int _starpu_check_that_no_data_request_is_pending(unsigned node)
+int _starpu_check_that_no_data_request_is_pending(unsigned node, unsigned peer_node, enum _starpu_data_request_inout inout)
 {
 {
-	return !data_requests_npending[node];
+	return !data_requests_npending[node][peer_node][inout];
 }
 }
 
 
 
 
 void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum starpu_is_prefetch prefetch)
 void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum starpu_is_prefetch prefetch)
 {
 {
+	_starpu_spin_checklocked(&r->handle->header_lock);
 	STARPU_ASSERT(r->prefetch > prefetch);
 	STARPU_ASSERT(r->prefetch > prefetch);
+
+	if (prefetch == STARPU_FETCH && !r->added_ref)
+	{
+		/* That would have been done by _starpu_create_data_request */
+		r->added_ref = 1;
+		r->dst_replicate->refcnt++;
+	}
+
 	r->prefetch=prefetch;
 	r->prefetch=prefetch;
 
 
 	if (prefetch >= STARPU_IDLEFETCH)
 	if (prefetch >= STARPU_IDLEFETCH)
@@ -867,27 +994,27 @@ void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum starpu_
 			_starpu_update_prefetch_status(next_req, prefetch);
 			_starpu_update_prefetch_status(next_req, prefetch);
 	}
 	}
 
 
-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
+	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node][r->peer_node][r->inout]);
 
 
 	int found = 1;
 	int found = 1;
 
 
 	/* The request can be in a different list (handling request or the temp list)
 	/* The request can be in a different list (handling request or the temp list)
 	 * we have to check that it is really in the prefetch or idle list. */
 	 * we have to check that it is really in the prefetch or idle list. */
-	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
-		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node], r);
-	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
-		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node], r);
+	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node][r->peer_node][r->inout], r))
+		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node][r->peer_node][r->inout], r);
+	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node][r->peer_node][r->inout], r))
+		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node][r->peer_node][r->inout], r);
 	else
 	else
 		found = 0;
 		found = 0;
 
 
 	if (found)
 	if (found)
 	{
 	{
 		if (prefetch > STARPU_FETCH)
 		if (prefetch > STARPU_FETCH)
-			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
+			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node][r->peer_node][r->inout],r);
 		else
 		else
-			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
+			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node][r->peer_node][r->inout],r);
 	}
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[r->handling_node]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[r->handling_node][r->peer_node][r->inout]);
 
 
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 	_starpu_wake_all_blocked_workers_on_node(r->handling_node);
 	_starpu_wake_all_blocked_workers_on_node(r->handling_node);

+ 34 - 13
src/datawizard/data_request.h

@@ -32,8 +32,8 @@
  * Data interfaces should also have to declare how many asynchronous requests
  * Data interfaces should also have to declare how many asynchronous requests
  * they have actually started (think of e.g. csr).
  * they have actually started (think of e.g. csr).
  */
  */
-#define MAX_PENDING_REQUESTS_PER_NODE 20
-#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 10
+#define MAX_PENDING_REQUESTS_PER_NODE 5
+#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 2
 #define MAX_PENDING_IDLE_REQUESTS_PER_NODE 1
 #define MAX_PENDING_IDLE_REQUESTS_PER_NODE 1
 /** Maximum time in us that we can afford pushing requests before going back to the driver loop, e.g. for checking GPU task termination */
 /** Maximum time in us that we can afford pushing requests before going back to the driver loop, e.g. for checking GPU task termination */
 #define MAX_PUSH_TIME 1000
 #define MAX_PUSH_TIME 1000
@@ -47,6 +47,11 @@ struct _starpu_callback_list
 	struct _starpu_callback_list *next;
 	struct _starpu_callback_list *next;
 };
 };
 
 
+enum _starpu_data_request_inout
+{
+	_STARPU_DATA_REQUEST_IN, _STARPU_DATA_REQUEST_OUT
+};
+
 /** This represents a data request, i.e. we want some data to get transferred
 /** This represents a data request, i.e. we want some data to get transferred
  * from a source to a destination. */
  * from a source to a destination. */
 LIST_TYPE(_starpu_data_request,
 LIST_TYPE(_starpu_data_request,
@@ -63,6 +68,8 @@ LIST_TYPE(_starpu_data_request,
 	 * the node can make the CUDA/OpenCL calls.
 	 * the node can make the CUDA/OpenCL calls.
 	 */
 	 */
 	unsigned handling_node;
 	unsigned handling_node;
+	unsigned peer_node;
+	enum _starpu_data_request_inout inout;
 
 
 	/*
 	/*
 	 * What the destination node wants to do with the data: write to it,
 	 * What the destination node wants to do with the data: write to it,
@@ -78,10 +85,19 @@ LIST_TYPE(_starpu_data_request,
 	struct _starpu_async_channel async_channel;
 	struct _starpu_async_channel async_channel;
 
 
 	/** Whether the transfer is completed. */
 	/** Whether the transfer is completed. */
-	unsigned completed;
+	unsigned completed:1;
+
+	/** Whether we have already added our reference to the dst replicate. */
+	unsigned added_ref:1;
+
+	/** Whether the request was canceled before being handled (because the transfer already happened another way). */
+	unsigned canceled:2;
 
 
 	/** Whether this is just a prefetch request */
 	/** Whether this is just a prefetch request */
-	enum starpu_is_prefetch prefetch;
+	enum starpu_is_prefetch prefetch:3;
+
+	/** Task this request is for */
+	struct starpu_task *task;
 
 
 	/** Number of tasks which used this as a prefetch */
 	/** Number of tasks which used this as a prefetch */
 	unsigned nb_tasks_prefetch;
 	unsigned nb_tasks_prefetch;
@@ -96,6 +112,10 @@ LIST_TYPE(_starpu_data_request,
 	 * dependencies. */
 	 * dependencies. */
 	unsigned ndeps;
 	unsigned ndeps;
 
 
+	/** Some further tasks may have requested prefetches for the same data
+	 * much later on, link with them */
+	struct _starpu_data_request *next_same_req;
+
 	/** in case we have a chain of request (eg. for nvidia multi-GPU), this
 	/** in case we have a chain of request (eg. for nvidia multi-GPU), this
 	 * is the list of requests which are waiting for this one. */
 	 * is the list of requests which are waiting for this one. */
 	struct _starpu_data_request *next_req[STARPU_MAXNODES+1];
 	struct _starpu_data_request *next_req[STARPU_MAXNODES+1];
@@ -123,7 +143,7 @@ LIST_TYPE(_starpu_data_requester,
 
 
 	int prio;
 	int prio;
 
 
-	/** if this is more complicated ... (eg. application request) 
+	/** if this is more complicated ... (eg. application request)
 	 * NB: this callback is not called with the lock taken !
 	 * NB: this callback is not called with the lock taken !
 	 */
 	 */
 	void (*ready_data_callback)(void *argcb);
 	void (*ready_data_callback)(void *argcb);
@@ -135,15 +155,15 @@ void _starpu_init_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
 void _starpu_post_data_request(struct _starpu_data_request *r);
 void _starpu_post_data_request(struct _starpu_data_request *r);
 /** returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
 /** returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
-int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
-int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
-int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed);
+int _starpu_handle_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
+int _starpu_handle_node_prefetch_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
+int _starpu_handle_node_idle_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned *pushed);
 
 
-int _starpu_handle_pending_node_data_requests(unsigned src_node);
-int _starpu_handle_all_pending_node_data_requests(unsigned src_node);
+int _starpu_handle_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
+int _starpu_handle_all_pending_node_data_requests(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
 
 
-int _starpu_check_that_no_data_request_exists(unsigned node);
-int _starpu_check_that_no_data_request_is_pending(unsigned node);
+int _starpu_check_that_no_data_request_exists(unsigned handling_node);
+int _starpu_check_that_no_data_request_is_pending(unsigned handling_node, unsigned peer_node, enum _starpu_data_request_inout inout);
 
 
 struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t handle,
 struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t handle,
 							 struct _starpu_data_replicate *src_replicate,
 							 struct _starpu_data_replicate *src_replicate,
@@ -151,12 +171,13 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
 							 unsigned ndeps,
+							 struct starpu_task *task,
 							 enum starpu_is_prefetch is_prefetch,
 							 enum starpu_is_prefetch is_prefetch,
 							 int prio,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 unsigned is_write_invalidation,
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
 
 
-int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigned may_alloc);
+int _starpu_wait_data_request_completion(struct _starpu_data_request *r, enum _starpu_may_alloc may_alloc);
 
 
 void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 					  void (*callback_func)(void *),
 					  void (*callback_func)(void *),

+ 87 - 25
src/datawizard/datawizard.c

@@ -26,19 +26,17 @@
 #include <core/simgrid.h>
 #include <core/simgrid.h>
 #endif
 #endif
 
 
-int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests)
+static int ____starpu_datawizard_progress(unsigned memory_node, unsigned peer_start, unsigned peer_end, enum  _starpu_data_request_inout inout, enum _starpu_may_alloc may_alloc, unsigned push_requests)
 {
 {
 	int ret = 0;
 	int ret = 0;
-
-#ifdef STARPU_SIMGRID
-	/* XXX */
-	starpu_sleep(0.000001);
-#endif
-	STARPU_UYIELD();
+	unsigned peer_node;
 
 
 	/* in case some other driver requested data */
 	/* in case some other driver requested data */
-	if (_starpu_handle_pending_node_data_requests(memory_node))
-		ret = 1;
+	for (peer_node = peer_start; peer_node < peer_end; peer_node++)
+	{
+		if (_starpu_handle_pending_node_data_requests(memory_node, peer_node, inout))
+			ret = 1;
+	}
 
 
 	starpu_memchunk_tidy(memory_node);
 	starpu_memchunk_tidy(memory_node);
 
 
@@ -46,26 +44,70 @@ int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsi
 	{
 	{
 		/* Some transfers have finished, or the driver requests to really push more */
 		/* Some transfers have finished, or the driver requests to really push more */
 		unsigned pushed;
 		unsigned pushed;
-		if (_starpu_handle_node_data_requests(memory_node, may_alloc, &pushed) == 0)
+		unsigned ok = 1;
+
+		for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
 		{
 		{
+			if (_starpu_handle_node_data_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
+				ok = 0;
 			if (pushed)
 			if (pushed)
 				ret = 1;
 				ret = 1;
+		}
+
+		if (ok)
+		{
+			unsigned doidle = 1;
+
 			/* We pushed all pending requests, we can afford pushing
 			/* We pushed all pending requests, we can afford pushing
 			 * prefetch requests */
 			 * prefetch requests */
-			_starpu_handle_node_prefetch_requests(memory_node, may_alloc, &pushed);
-			if (_starpu_check_that_no_data_request_is_pending(memory_node))
+			for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
+			{
+				if (_starpu_handle_node_prefetch_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
+					ok = 0;
+				if (pushed)
+					ret = 1;
+				if (!_starpu_check_that_no_data_request_is_pending(memory_node, peer_node, inout))
+					doidle = 0;
+			}
+
+			if (doidle)
 				/* No pending transfer, push some idle transfer */
 				/* No pending transfer, push some idle transfer */
-				_starpu_handle_node_idle_requests(memory_node, may_alloc, &pushed);
+				for (peer_node = peer_start; ok && peer_node < peer_end; peer_node++)
+				{
+					if (_starpu_handle_node_idle_requests(memory_node, peer_node, inout, may_alloc, &pushed) == -ENOMEM)
+						ok = 0;
+					if (pushed)
+						ret = 1;
+				}
 		}
 		}
-		if (pushed)
-			ret = 1;
+
 	}
 	}
-	_starpu_execute_registered_progression_hooks();
 
 
 	return ret;
 	return ret;
 }
 }
 
 
-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
+static int ___starpu_datawizard_progress(unsigned memory_node, unsigned nnodes, enum _starpu_may_alloc may_alloc, unsigned push_requests)
+{
+	int ret = 0;
+	unsigned peer_node;
+
+#ifdef STARPU_SIMGRID
+	/* XXX */
+	starpu_sleep(0.000001);
+#endif
+	STARPU_UYIELD();
+
+	/* First handle all incoming transfers */
+	ret |= ____starpu_datawizard_progress(memory_node, 0, nnodes, _STARPU_DATA_REQUEST_IN, may_alloc, push_requests);
+
+	/* Then handle outgoing transfers */
+	for (peer_node = 0; peer_node < nnodes; peer_node++)
+		ret |= ____starpu_datawizard_progress(memory_node, peer_node, peer_node+1, _STARPU_DATA_REQUEST_OUT, may_alloc, push_requests);
+
+	return ret;
+}
+
+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests)
 {
 {
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
         unsigned memnode;
         unsigned memnode;
@@ -77,7 +119,8 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 		int nnumas = starpu_memory_nodes_get_numa_count();
 		int nnumas = starpu_memory_nodes_get_numa_count();
 		int numa;
 		int numa;
 		for (numa = 0; numa < nnumas; numa++)
 		for (numa = 0; numa < nnumas; numa++)
-			ret |=  ___starpu_datawizard_progress(numa, may_alloc, push_requests);
+			ret |=  ___starpu_datawizard_progress(numa, nnumas, may_alloc, push_requests);
+		_starpu_execute_registered_progression_hooks();
 
 
 		return ret;
 		return ret;
 	}
 	}
@@ -87,19 +130,38 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 		worker = &worker->set->workers[0];
 		worker = &worker->set->workers[0];
 
 
 	unsigned current_worker_id = worker->workerid;
 	unsigned current_worker_id = worker->workerid;
-        int ret = 0;
+	int ret = 0;
 	unsigned nnodes = starpu_memory_nodes_get_count();
 	unsigned nnodes = starpu_memory_nodes_get_count();
 
 
-        for (memnode = 0; memnode < nnodes; memnode++)
-        {
-                if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
-                        ret |= ___starpu_datawizard_progress(memnode, may_alloc, push_requests);
-        }
+	for (memnode = 0; memnode < nnodes; memnode++)
+	{
+		if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
+		{
+			if(_starpu_config.conf.cuda_only_fast_alloc_other_memnodes && worker->arch == STARPU_CUDA_WORKER && worker->memory_node != memnode)
+				ret |=  ___starpu_datawizard_progress(memnode, nnodes, STARPU_DATAWIZARD_ONLY_FAST_ALLOC, push_requests);
+			else
+				ret |=  ___starpu_datawizard_progress(memnode, nnodes, may_alloc, push_requests);
+			}
+	}
+
+	_starpu_execute_registered_progression_hooks();
 
 
         return ret;
         return ret;
 }
 }
 
 
-void _starpu_datawizard_progress(unsigned may_alloc)
+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc)
 {
 {
         __starpu_datawizard_progress(may_alloc, 1);
         __starpu_datawizard_progress(may_alloc, 1);
 }
 }
+
+void _starpu_datawizard_handle_all_pending_node_data_requests(unsigned memnode)
+{
+	unsigned nnodes = starpu_memory_nodes_get_count();
+	unsigned memnode2;
+
+	for (memnode2 = 0; memnode2 < nnodes; memnode2++)
+	{
+		_starpu_handle_all_pending_node_data_requests(memnode, memnode2, _STARPU_DATA_REQUEST_IN);
+		_starpu_handle_all_pending_node_data_requests(memnode, memnode2, _STARPU_DATA_REQUEST_OUT);
+	}
+}

+ 8 - 7
src/datawizard/datawizard.h

@@ -34,18 +34,19 @@
 
 
 #include <core/dependencies/implicit_data_deps.h>
 #include <core/dependencies/implicit_data_deps.h>
 
 
-/** Make data transfers progress on node \p memory_node.
+
+/** Make data transfers progress on all memory nodes driven by the current worker.
  *
  *
  * If \p push_requests is 1, it can start new transfers
  * If \p push_requests is 1, it can start new transfers
  *
  *
- * If \p may_alloc is 1, it can allocate destination data for transfers
+ * If \p may_alloc is STARPU_DATAWIZARD_DO_ALLOC, it can allocate destination data for transfers
  * (this is not possible e.g. when spinning for a handle lock)
  * (this is not possible e.g. when spinning for a handle lock)
  */
  */
-int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
-/** Call ___starpu_datawizard_progress() for all memory nodes driven by the
- * current worker */
-int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests);
+int __starpu_datawizard_progress(enum _starpu_may_alloc may_alloc, unsigned push_requests);
 /** Call __starpu_datawizard_progress with push_requests = 1 */
 /** Call __starpu_datawizard_progress with push_requests = 1 */
-void _starpu_datawizard_progress(unsigned may_alloc);
+void _starpu_datawizard_progress(enum _starpu_may_alloc may_alloc);
+
+/** Handle all pending data requests of node \p memnode, for every peer node and in both directions */
+void _starpu_datawizard_handle_all_pending_node_data_requests(unsigned memnode);
 
 
 #endif // __DATAWIZARD_H__
 #endif // __DATAWIZARD_H__

+ 1 - 1
src/datawizard/filters.c

@@ -193,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		int home_node = initial_handle->home_node;
 		int home_node = initial_handle->home_node;
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 			home_node = STARPU_MAIN_RAM;
 			home_node = STARPU_MAIN_RAM;
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH, 0);
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #warning we should reclaim memory if allocation failed
 #endif
 #endif

+ 5 - 2
src/datawizard/interfaces/data_interface.c

@@ -375,13 +375,14 @@ _starpu_data_initialize_per_worker(starpu_data_handle_t handle)
 		replicate->state = STARPU_INVALID;
 		replicate->state = STARPU_INVALID;
 		//replicate->refcnt = 0;
 		//replicate->refcnt = 0;
 		replicate->handle = handle;
 		replicate->handle = handle;
-		//replicate->requested = 0;
 		//replicate->nb_tasks_prefetch = 0;
 		//replicate->nb_tasks_prefetch = 0;
 
 
 		//for (node = 0; node < STARPU_MAXNODES; node++)
 		//for (node = 0; node < STARPU_MAXNODES; node++)
 		//{
 		//{
 		//	replicate->request[node] = NULL;
 		//	replicate->request[node] = NULL;
+		//	replicate->last_request[node] = NULL;
 		//}
 		//}
+		//replicate->load_request = NULL;
 
 
 		/* Assuming being used for SCRATCH for now, patched when entering REDUX mode */
 		/* Assuming being used for SCRATCH for now, patched when entering REDUX mode */
 		replicate->relaxed_coherency = 1;
 		replicate->relaxed_coherency = 1;
@@ -785,7 +786,7 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	}
 	}
 	if (valid)
 	if (valid)
 	{
 	{
-		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
+		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
 	}
 	}
@@ -1033,6 +1034,7 @@ retry_busy:
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 	{
 		struct _starpu_data_replicate *local = &handle->per_node[node];
 		struct _starpu_data_replicate *local = &handle->per_node[node];
+		STARPU_ASSERT(!local->refcnt);
 		if (local->allocated)
 		if (local->allocated)
 		{
 		{
 			_starpu_data_unregister_ram_pointer(handle, node);
 			_starpu_data_unregister_ram_pointer(handle, node);
@@ -1049,6 +1051,7 @@ retry_busy:
 		for (worker = 0; worker < nworkers; worker++)
 		for (worker = 0; worker < nworkers; worker++)
 		{
 		{
 			struct _starpu_data_replicate *local = &handle->per_worker[worker];
 			struct _starpu_data_replicate *local = &handle->per_worker[worker];
+			STARPU_ASSERT(!local->refcnt);
 			/* free the data copy in a lazy fashion */
 			/* free the data copy in a lazy fashion */
 			if (local->allocated && local->automatically_allocated)
 			if (local->allocated && local->automatically_allocated)
 				_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);
 				_starpu_request_mem_chunk_removal(handle, local, starpu_worker_get_memory_node(worker), size);

+ 10 - 0
src/datawizard/malloc.c

@@ -149,6 +149,15 @@ static int _starpu_malloc_should_pin(int flags)
 	return 0;
 	return 0;
 }
 }
 
 
+int _starpu_malloc_willpin_on_node(unsigned dst_node)
+{
+	int flags = malloc_on_node_default_flags[dst_node];
+	return (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0
+			&& (_starpu_can_submit_cuda_task()
+			    /* || _starpu_can_submit_opencl_task() */
+			));
+}
+
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
 {
 {
 	int ret=0;
 	int ret=0;
@@ -185,6 +194,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 		goto end;
 		goto end;
 	}
 	}
 
 
+	/* Note: synchronize this test with _starpu_malloc_willpin_on_node */
 	if (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0)
 	if (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
 	{
 		if (_starpu_can_submit_cuda_task())
 		if (_starpu_can_submit_cuda_task())

+ 7 - 0
src/datawizard/malloc.h

@@ -26,4 +26,11 @@ void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
 
 
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
 int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags);
 int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags);
+
+/**
+   Returns whether allocating data on \p dst_node will involve pinning, i.e.
+   whether the allocation will be very expensive and should thus be moved out
+   of the critical path
+  */
+int _starpu_malloc_willpin_on_node(unsigned dst_node);
 #endif
 #endif

+ 21 - 11
src/datawizard/memalloc.c

@@ -169,7 +169,10 @@ void _starpu_mem_chunk_disk_register(unsigned disk_memnode)
 	{
 	{
 		enum starpu_node_kind kind = starpu_node_get_kind(i);
 		enum starpu_node_kind kind = starpu_node_get_kind(i);
 		if (kind == STARPU_CPU_RAM)
 		if (kind == STARPU_CPU_RAM)
+		{
+			STARPU_HG_DISABLE_CHECKING(evictable[i]);
 			evictable[i] = 1;
 			evictable[i] = 1;
+		}
 	}
 	}
 }
 }
 
 
@@ -327,7 +330,7 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 		{
 		{
 			/* This is the only copy, push it to destination */
 			/* This is the only copy, push it to destination */
 			struct _starpu_data_request *r;
 			struct _starpu_data_request *r;
-			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
+			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
 			/* There is no way we don't need a request, since
 			/* There is no way we don't need a request, since
 			 * source is OWNER, destination can't be having it */
 			 * source is OWNER, destination can't be having it */
 			STARPU_ASSERT(r);
 			STARPU_ASSERT(r);
@@ -552,8 +555,9 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
 
 int starpu_data_can_evict(starpu_data_handle_t handle, unsigned node, enum starpu_is_prefetch is_prefetch)
 int starpu_data_can_evict(starpu_data_handle_t handle, unsigned node, enum starpu_is_prefetch is_prefetch)
 {
 {
+	STARPU_ASSERT(node < STARPU_MAXNODES);
 	/* This data should be written through to this node, avoid dropping it! */
 	/* This data should be written through to this node, avoid dropping it! */
-	if (handle->wt_mask & (1<<node))
+	if (node < sizeof(handle->wt_mask) * 8 && handle->wt_mask & (1<<node))
 		return 0;
 		return 0;
 
 
 	/* This data was registered from this node, we will not be able to drop it anyway */
 	/* This data was registered from this node, we will not be able to drop it anyway */
@@ -1012,7 +1016,7 @@ restart2:
 				next_mc->remove_notify = &next_mc;
 				next_mc->remove_notify = &next_mc;
 			}
 			}
 			/* Note: this may unlock mc_list! */
 			/* Note: this may unlock mc_list! */
-			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, STARPU_FETCH);
+			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, is_prefetch);
 
 
 			if (orig_next_mc)
 			if (orig_next_mc)
 			{
 			{
@@ -1179,7 +1183,7 @@ void starpu_memchunk_tidy(unsigned node)
 			if (
 			if (
 				/* This data should be written through to this node, avoid
 				/* This data should be written through to this node, avoid
 				 * dropping it! */
 				 * dropping it! */
-				handle->wt_mask & (1<<node)
+				(node < sizeof(handle->wt_mask) * 8 && handle->wt_mask & (1<<node))
 				/* This is partitioned, don't care about the
 				/* This is partitioned, don't care about the
 				 * whole data, we'll work on the subdatas.  */
 				 * whole data, we'll work on the subdatas.  */
 			     || handle->nchildren
 			     || handle->nchildren
@@ -1231,7 +1235,7 @@ void starpu_memchunk_tidy(unsigned node)
 			}
 			}
 
 
 			_starpu_spin_unlock(&mc_lock[node]);
 			_starpu_spin_unlock(&mc_lock[node]);
-			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
+			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, NULL, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
 			{
 			{
 				/* No request was actually needed??
 				/* No request was actually needed??
 				 * Odd, but cope with it.  */
 				 * Odd, but cope with it.  */
@@ -1442,7 +1446,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
  *
  *
  */
  */
 
 
-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch)
+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
 {
 {
 	unsigned attempts = 0;
 	unsigned attempts = 0;
 	starpu_ssize_t allocated_memory;
 	starpu_ssize_t allocated_memory;
@@ -1473,6 +1477,12 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	if (!prefetch_oom)
 	if (!prefetch_oom)
 		_STARPU_TRACE_END_ALLOC_REUSE(dst_node, handle, 0);
 		_STARPU_TRACE_END_ALLOC_REUSE(dst_node, handle, 0);
 #endif
 #endif
+
+	/* If this is RAM and pinned this will be slow
+	   In case we only want fast allocations return here */
+	if(only_fast_alloc && starpu_node_get_kind(dst_node) == STARPU_CPU_RAM && _starpu_malloc_willpin_on_node(dst_node))
+		return -ENOMEM;
+
 	STARPU_ASSERT(handle->ops);
 	STARPU_ASSERT(handle->ops);
 	STARPU_ASSERT(handle->ops->allocate_data_on_node);
 	STARPU_ASSERT(handle->ops->allocate_data_on_node);
 	STARPU_ASSERT(replicate->data_interface);
 	STARPU_ASSERT(replicate->data_interface);
@@ -1576,7 +1586,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 	{
 		cpt++;
 		cpt++;
-		_starpu_datawizard_progress(0);
+		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
 	}
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
@@ -1620,7 +1630,7 @@ out:
 	return allocated_memory;
 	return allocated_memory;
 }
 }
 
 
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc)
 {
 {
 	starpu_ssize_t allocated_memory;
 	starpu_ssize_t allocated_memory;
 
 
@@ -1635,7 +1645,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 		return 0;
 		return 0;
 
 
 	STARPU_ASSERT(replicate->data_interface);
 	STARPU_ASSERT(replicate->data_interface);
-	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch);
+	allocated_memory = _starpu_allocate_interface(handle, replicate, dst_node, is_prefetch, only_fast_alloc);
 
 
 	/* perhaps we could really not handle that capacity misses */
 	/* perhaps we could really not handle that capacity misses */
 	if (allocated_memory == -ENOMEM)
 	if (allocated_memory == -ENOMEM)
@@ -1845,7 +1855,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
 			for (i=0; i<nb_numa_nodes; i++)
 			for (i=0; i<nb_numa_nodes; i++)
 			{
 			{
-				if (handle->per_node[i].allocated || 
+				if (handle->per_node[i].allocated ||
 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
 				{
 				{
 					target = i;
 					target = i;
@@ -1877,7 +1887,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
 			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
 			for (i=0; i<nb_numa_nodes; i++)
 			for (i=0; i<nb_numa_nodes; i++)
 			{
 			{
-				if (handle->per_node[i].allocated || 
+				if (handle->per_node[i].allocated ||
 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
 				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
 				{
 				{
 					target = i;
 					target = i;

+ 1 - 1
src/datawizard/memalloc.h

@@ -83,7 +83,7 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_is_prefetch is_prefetch, int only_fast_alloc);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);

+ 3 - 4
src/datawizard/memory_nodes.c

@@ -151,6 +151,7 @@ void _starpu_memory_node_register_condition(struct _starpu_worker *worker, starp
 #undef starpu_worker_get_memory_node
 #undef starpu_worker_get_memory_node
 unsigned starpu_worker_get_memory_node(unsigned workerid)
 unsigned starpu_worker_get_memory_node(unsigned workerid)
 {
 {
+	(void) workerid;
 	return _starpu_worker_get_memory_node(workerid);
 	return _starpu_worker_get_memory_node(workerid);
 }
 }
 
 
@@ -167,12 +168,10 @@ void _starpu_worker_drives_memory_node(struct _starpu_worker *worker, unsigned m
 	}
 	}
 }
 }
 
 
+#undef starpu_worker_get_local_memory_node
 unsigned starpu_worker_get_local_memory_node(void)
 unsigned starpu_worker_get_local_memory_node(void)
 {
 {
-	struct _starpu_worker *worker = _starpu_get_local_worker_key();
-	if (!worker)
-		return STARPU_MAIN_RAM;
-	return worker->memory_node;
+	return _starpu_worker_get_local_memory_node();
 }
 }
 
 
 int starpu_memory_node_get_devid(unsigned node)
 int starpu_memory_node_get_devid(unsigned node)

+ 21 - 0
src/datawizard/memory_nodes.h

@@ -117,12 +117,19 @@ static inline enum starpu_node_kind _starpu_node_get_kind(unsigned node)
 }
 }
 #define starpu_node_get_kind _starpu_node_get_kind
 #define starpu_node_get_kind _starpu_node_get_kind
 
 
+#if STARPU_MAXNODES == 1
+#define _starpu_memory_nodes_get_count() 1
+#else
 static inline unsigned _starpu_memory_nodes_get_count(void)
 static inline unsigned _starpu_memory_nodes_get_count(void)
 {
 {
 	return _starpu_descr.nnodes;
 	return _starpu_descr.nnodes;
 }
 }
+#endif
 #define starpu_memory_nodes_get_count _starpu_memory_nodes_get_count
 #define starpu_memory_nodes_get_count _starpu_memory_nodes_get_count
 
 
+#if STARPU_MAXNODES == 1
+#define _starpu_worker_get_memory_node(workerid) 0
+#else
 static inline unsigned _starpu_worker_get_memory_node(unsigned workerid)
 static inline unsigned _starpu_worker_get_memory_node(unsigned workerid)
 {
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
@@ -139,6 +146,20 @@ static inline unsigned _starpu_worker_get_memory_node(unsigned workerid)
 	return config->combined_workers[workerid - nworkers].memory_node;
 	return config->combined_workers[workerid - nworkers].memory_node;
 
 
 }
 }
+#endif
 #define starpu_worker_get_memory_node _starpu_worker_get_memory_node
 #define starpu_worker_get_memory_node _starpu_worker_get_memory_node
 
 
+#if STARPU_MAXNODES == 1
+#define _starpu_worker_get_local_memory_node() 0
+#else
+static inline unsigned _starpu_worker_get_local_memory_node(void)
+{
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	if (!worker)
+		return STARPU_MAIN_RAM;
+	return worker->memory_node;
+}
+#endif
+#define starpu_worker_get_local_memory_node _starpu_worker_get_local_memory_node
+
 #endif // __MEMORY_NODES_H__
 #endif // __MEMORY_NODES_H__

+ 11 - 2
src/datawizard/reduction.c

@@ -280,12 +280,21 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 					redux_task->cl = handle->redux_cl;
 					redux_task->cl = handle->redux_cl;
 					STARPU_ASSERT(redux_task->cl);
 					STARPU_ASSERT(redux_task->cl);
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0)))
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0)))
-						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW, 0);
+						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW|STARPU_COMMUTE, 0);
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 1)))
 					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 1)))
 						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_R, 1);
 						STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_R, 1);
 
 
-					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
+					STARPU_ASSERT_MSG((STARPU_CODELET_GET_MODE(redux_task->cl, 0) & ~STARPU_COMMUTE) == STARPU_RW, "First parameter of reduction codelet %p has to be RW", redux_task->cl);
 					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet %p has to be R", redux_task->cl);
 					STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet %p has to be R", redux_task->cl);
+					if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0) & STARPU_COMMUTE))
+					{
+						static int warned;
+						if (!warned)
+						{
+							warned = 1;
+							_STARPU_DISP("Warning: for reductions, codelet %p should have STARPU_COMMUTE along STARPU_RW\n", redux_task->cl);
+						}
+					}
 
 
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i], 0);
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i], 0);
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i+step], 1);
 					STARPU_TASK_SET_HANDLE(redux_task, replicate_array[i+step], 1);

+ 11 - 8
src/datawizard/user_interactions.c

@@ -53,7 +53,7 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 
 
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
 
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, NULL, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
 
 
 	/* we do not increase the refcnt associated to the request since we are
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
 	 * not waiting for its termination */
@@ -126,7 +126,7 @@ static inline void _starpu_data_acquire_launch_fetch(struct user_interaction_wra
 	starpu_data_handle_t handle = wrapper->handle;
 	starpu_data_handle_t handle = wrapper->handle;
 	struct _starpu_data_replicate *replicate = node >= 0 ? &handle->per_node[node] : NULL;
 	struct _starpu_data_replicate *replicate = node >= 0 ? &handle->per_node[node] : NULL;
 
 
-	int ret = _starpu_fetch_data_on_node(handle, node, replicate, wrapper->mode, wrapper->detached, wrapper->prefetch, async, callback, callback_arg, wrapper->prio, "_starpu_data_acquire_launch_fetch");
+	int ret = _starpu_fetch_data_on_node(handle, node, replicate, wrapper->mode, wrapper->detached, NULL, wrapper->prefetch, async, callback, callback_arg, wrapper->prio, "_starpu_data_acquire_launch_fetch");
 	STARPU_ASSERT(!ret);
 	STARPU_ASSERT(!ret);
 }
 }
 
 
@@ -191,7 +191,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 							  void (*callback)(void *arg),
 							  void (*callback)(void *arg),
 							  void *arg,
 							  void *arg,
 							  int sequential_consistency, int quick,
 							  int sequential_consistency, int quick,
-							  long *pre_sync_jobid, long *post_sync_jobid)
+							  long *pre_sync_jobid, long *post_sync_jobid, int prio)
 {
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data (%p) is not possible", handle);
 	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data (%p) is not possible", handle);
@@ -211,6 +211,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 	wrapper->callback_arg = arg;
 	wrapper->callback_arg = arg;
 	wrapper->pre_sync_task = NULL;
 	wrapper->pre_sync_task = NULL;
 	wrapper->post_sync_task = NULL;
 	wrapper->post_sync_task = NULL;
+	wrapper->prio = prio;
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int handle_sequential_consistency = handle->sequential_consistency;
 	int handle_sequential_consistency = handle->sequential_consistency;
@@ -225,6 +226,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 		wrapper->pre_sync_task->callback_func = starpu_data_acquire_cb_pre_sync_callback;
 		wrapper->pre_sync_task->callback_func = starpu_data_acquire_cb_pre_sync_callback;
 		wrapper->pre_sync_task->callback_arg = wrapper;
 		wrapper->pre_sync_task->callback_arg = wrapper;
 		wrapper->pre_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
 		wrapper->pre_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
+		wrapper->pre_sync_task->priority = prio;
 		pre_sync_job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
 		pre_sync_job = _starpu_get_job_associated_to_task(wrapper->pre_sync_task);
 		if (pre_sync_jobid)
 		if (pre_sync_jobid)
 			*pre_sync_jobid = pre_sync_job->job_id;
 			*pre_sync_jobid = pre_sync_job->job_id;
@@ -233,6 +235,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 		wrapper->post_sync_task->name = "_starpu_data_acquire_cb_release";
 		wrapper->post_sync_task->name = "_starpu_data_acquire_cb_release";
 		wrapper->post_sync_task->detach = 1;
 		wrapper->post_sync_task->detach = 1;
 		wrapper->post_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
 		wrapper->post_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
+		wrapper->post_sync_task->priority = prio;
 		post_sync_job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
 		post_sync_job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
 		if (post_sync_jobid)
 		if (post_sync_jobid)
 			*post_sync_jobid = post_sync_job->job_id;
 			*post_sync_jobid = post_sync_job->job_id;
@@ -280,7 +283,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_quick(starpu_data_hand
 							  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg,
 							  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg,
 							  int sequential_consistency, int quick)
 							  int sequential_consistency, int quick)
 {
 {
-	return starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(handle, node, mode, NULL, callback, arg, sequential_consistency, quick, NULL, NULL);
+	return starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(handle, node, mode, NULL, callback, arg, sequential_consistency, quick, NULL, NULL, STARPU_DEFAULT_PRIO);
 }
 }
 
 
 int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node,
 int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node,
@@ -616,7 +619,7 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
 
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, 0);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, STARPU_DEFAULT_PRIO);
 }
 }
 
 
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
@@ -626,7 +629,7 @@ int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node
 
 
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
 {
-	return starpu_data_prefetch_on_node_prio(handle, node, async, 0);
+	return starpu_data_prefetch_on_node_prio(handle, node, async, STARPU_DEFAULT_PRIO);
 }
 }
 
 
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
@@ -636,7 +639,7 @@ int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned
 
 
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
 {
-	return starpu_data_idle_prefetch_on_node_prio(handle, node, async, 0);
+	return starpu_data_idle_prefetch_on_node_prio(handle, node, async, STARPU_DEFAULT_PRIO);
 }
 }
 
 
 static void _starpu_data_wont_use(void *data)
 static void _starpu_data_wont_use(void *data)
@@ -817,7 +820,7 @@ void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int
 		unsigned node;
 		unsigned node;
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
 		{
-			if (handle->per_node[memory_node].requested & (1UL << node))
+			if (handle->per_node[memory_node].request[node])
 			{
 			{
 				requested = 1;
 				requested = 1;
 				break;
 				break;

+ 2 - 2
src/datawizard/write_back.c

@@ -50,7 +50,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 				while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 				while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 				{
 				{
 					cpt++;
 					cpt++;
-					__starpu_datawizard_progress(1, 1);
+					__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 				}
 				}
 				if (cpt == STARPU_SPIN_MAXTRY)
 				if (cpt == STARPU_SPIN_MAXTRY)
 					_starpu_spin_lock(&handle->header_lock);
 					_starpu_spin_lock(&handle->header_lock);
@@ -64,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
 
 				struct _starpu_data_request *r;
 				struct _starpu_data_request *r;
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
-									 STARPU_R, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
+									 STARPU_R, NULL, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
 
 
 			        /* If no request was created, the handle was already up-to-date on the
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */
 			         * node */

+ 2 - 2
src/debug/latency.c

@@ -34,7 +34,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 
 
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
-		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
 
 
@@ -44,7 +44,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 
 
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
-		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, NULL, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
 		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
 	}
 	}

+ 51 - 42
src/debug/traces/starpu_fxt.c

@@ -251,11 +251,12 @@ static void task_dump(struct task_info *task, struct starpu_fxt_options *options
 		fprintf(tasks_file, "\n");
 		fprintf(tasks_file, "\n");
 		fprintf(tasks_file, "Modes:");
 		fprintf(tasks_file, "Modes:");
 		for (i = 0; i < task->ndata; i++)
 		for (i = 0; i < task->ndata; i++)
-			fprintf(tasks_file, " %s%s%s%s%s",
+			fprintf(tasks_file, " %s%s%s%s%s%s",
 				(task->data[i].mode & STARPU_R)?"R":"",
 				(task->data[i].mode & STARPU_R)?"R":"",
 				(task->data[i].mode & STARPU_W)?"W":"",
 				(task->data[i].mode & STARPU_W)?"W":"",
 				(task->data[i].mode & STARPU_SCRATCH)?"S":"",
 				(task->data[i].mode & STARPU_SCRATCH)?"S":"",
 				(task->data[i].mode & STARPU_REDUX)?"X":"",
 				(task->data[i].mode & STARPU_REDUX)?"X":"",
+				(task->data[i].mode & STARPU_MPI_REDUX)?"X-mpi":"",
 				(task->data[i].mode & STARPU_COMMUTE)?"C":"");
 				(task->data[i].mode & STARPU_COMMUTE)?"C":"");
 		fprintf(tasks_file, "\n");
 		fprintf(tasks_file, "\n");
 		fprintf(tasks_file, "Sizes:");
 		fprintf(tasks_file, "Sizes:");
@@ -763,15 +764,20 @@ static void memnode_pop_state(double time, const char *prefix, unsigned int memn
 #endif
 #endif
 }
 }
 
 
-static void memnode_event(double time, const char *prefix, unsigned int memnodeid, const char *name, unsigned long handle, unsigned long info, unsigned long size, unsigned int dest, struct starpu_fxt_options *options)
+static void memnode_event(double time, const char *prefix, unsigned int memnodeid, const char *name, unsigned long handle, unsigned long value, unsigned long info, long size_prio, unsigned int dest, struct starpu_fxt_options *options)
 {
 {
 	if (!options->memory_states)
 	if (!options->memory_states)
 		return;
 		return;
+	// If there is not a valid memory node, we cant associate it
+	if((int)memnodeid < 0)
+		return;
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
 	char container[STARPU_POTI_STR_LEN];
 	char p_handle[STARPU_POTI_STR_LEN];
 	char p_handle[STARPU_POTI_STR_LEN];
+	char p_value[STARPU_POTI_STR_LEN];
 	memmanager_container_alias(container, STARPU_POTI_STR_LEN, prefix, memnodeid);
 	memmanager_container_alias(container, STARPU_POTI_STR_LEN, prefix, memnodeid);
 	snprintf(p_handle, sizeof(p_handle), "%lx", handle);
 	snprintf(p_handle, sizeof(p_handle), "%lx", handle);
+	snprintf(p_value, sizeof(p_value), "%lx", value);
 
 
 #ifdef HAVE_POTI_USER_NEWEVENT
 #ifdef HAVE_POTI_USER_NEWEVENT
 	char p_dest[STARPU_POTI_STR_LEN];
 	char p_dest[STARPU_POTI_STR_LEN];
@@ -780,15 +786,15 @@ static void memnode_event(double time, const char *prefix, unsigned int memnodei
 
 
 	memmanager_container_alias(p_dest, STARPU_POTI_STR_LEN, prefix, dest);
 	memmanager_container_alias(p_dest, STARPU_POTI_STR_LEN, prefix, dest);
 	snprintf(p_info, sizeof(p_info), "%lu", info);
 	snprintf(p_info, sizeof(p_info), "%lu", info);
-	snprintf(p_size, sizeof(p_size), "%lu", size);
+	snprintf(p_size, sizeof(p_size), "%ld", size_prio);
 
 
-	poti_user_NewEvent(_starpu_poti_MemoryEvent, time, container, name, "0", 4,
+	poti_user_NewEvent(_starpu_poti_MemoryEvent, time, container, name, p_value, 4,
 			   p_handle, p_info, p_size, p_dest);
 			   p_handle, p_info, p_size, p_dest);
 #else
 #else
 	poti_NewEvent(time, container, name, p_handle);
 	poti_NewEvent(time, container, name, p_handle);
 #endif
 #endif
 #else
 #else
-	fprintf(out_paje_file, "22    %.9f    %s %smm%u  0 %lx %lu %lu %smm%u\n", time, name, prefix, memnodeid, handle, info, size, prefix, dest);
+	fprintf(out_paje_file, "22    %.9f    %s %smm%u  %lx %lx %lu %ld %smm%u\n", time, name, prefix, memnodeid, value, handle, info, size_prio, prefix, dest);
 #endif
 #endif
 }
 }
 
 
@@ -2232,7 +2238,7 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 		{
 		{
 			double time = get_event_time_stamp(ev, options);
 			double time = get_event_time_stamp(ev, options);
 			memnode_push_state(time, prefix, dst, "Co");
 			memnode_push_state(time, prefix, dst, "Co");
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCo", handle, comid, size, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCo", handle, 0, comid, size, src, options);
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN], src_memnode_container[STARPU_POTI_STR_LEN];
 			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN], src_memnode_container[STARPU_POTI_STR_LEN];
 			char program_container[STARPU_POTI_STR_LEN];
 			char program_container[STARPU_POTI_STR_LEN];
@@ -2351,7 +2357,7 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 		{
 		{
 			double time = get_event_time_stamp(ev, options);
 			double time = get_event_time_stamp(ev, options);
 			memnode_pop_state(time, prefix, dst);
 			memnode_pop_state(time, prefix, dst);
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoE", handle, comid, size, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoE", handle, 0, comid, size, src, options);
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
 			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
 			char dst_memnode_container[STARPU_POTI_STR_LEN], program_container[STARPU_POTI_STR_LEN];
 			char dst_memnode_container[STARPU_POTI_STR_LEN], program_container[STARPU_POTI_STR_LEN];
@@ -2378,7 +2384,7 @@ static void handle_start_driver_copy_async(struct fxt_ev_64 *ev, struct starpu_f
 		if (out_paje_file)
 		if (out_paje_file)
 		{
 		{
 			memnode_push_state(get_event_time_stamp(ev, options), prefix, dst, "CoA");
 			memnode_push_state(get_event_time_stamp(ev, options), prefix, dst, "CoA");
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoA", 0, 0, 0, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoA", 0, 0, 0, 0, src, options);
 		}
 		}
 
 
 }
 }
@@ -2394,7 +2400,7 @@ static void handle_end_driver_copy_async(struct fxt_ev_64 *ev, struct starpu_fxt
 		if (out_paje_file)
 		if (out_paje_file)
 		{
 		{
 			memnode_pop_state(get_event_time_stamp(ev, options), prefix, dst);
 			memnode_pop_state(get_event_time_stamp(ev, options), prefix, dst);
-			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoAE", 0, 0, 0, src, options);
+			memnode_event(get_event_time_stamp(ev, options), options->file_prefix, dst, "DCoAE", 0, 0, 0, 0, src, options);
 		}
 		}
 }
 }
 
 
@@ -2408,32 +2414,36 @@ static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
 }
 }
 
 
+static void handle_data_request(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
+{
+	unsigned memnode = ev->param[0];
+	unsigned dest = ev->param[1];
+	unsigned prio = ev->param[2];
+	unsigned long handle = ev->param[3];
+	unsigned prefe = ev->param[4];
+	unsigned long request = ev->param[5];
+
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, request, prefe, prio, dest, options);
+}
+
 static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 static void handle_memnode_event_start_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 {
 {
 	unsigned memnode = ev->param[0];
 	unsigned memnode = ev->param[0];
 	unsigned size = ev->param[2];
 	unsigned size = ev->param[2];
 	unsigned long handle = ev->param[3];
 	unsigned long handle = ev->param[3];
 
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, size, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, size, memnode, options);
 }
 }
 
 
 static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 static void handle_memnode_event_start_4(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 {
 {
 	unsigned memnode = ev->param[0];
 	unsigned memnode = ev->param[0];
-	unsigned dest = ev->param[1];
-	if(strcmp(eventstr, "rc")==0)
-	{
-		//If it is a Request Create, use dest normally
-	}
-	else
-	{
-		dest = memnode;
-	}
+	//unsigned dest = ev->param[1]; // Not used
 	unsigned size = ev->param[2];
 	unsigned size = ev->param[2];
 	unsigned long handle = ev->param[3];
 	unsigned long handle = ev->param[3];
 	unsigned prefe = ev->param[4];
 	unsigned prefe = ev->param[4];
 
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, prefe, size, dest, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, prefe, size, memnode, options);
 }
 }
 
 
 static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -2442,7 +2452,7 @@ static void handle_memnode_event_end_3(struct fxt_ev_64 *ev, struct starpu_fxt_o
 	unsigned long handle = ev->param[2];
 	unsigned long handle = ev->param[2];
 	unsigned info = ev->param[3];
 	unsigned info = ev->param[3];
 
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, info, 0, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, info, 0, memnode, options);
 }
 }
 
 
 static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -2450,7 +2460,7 @@ static void handle_memnode_event_start_2(struct fxt_ev_64 *ev, struct starpu_fxt
 	unsigned memnode = ev->param[0];
 	unsigned memnode = ev->param[0];
 	unsigned long handle = ev->param[2];
 	unsigned long handle = ev->param[2];
 
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, 0, memnode, options);
 }
 }
 
 
 static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -2458,7 +2468,7 @@ static void handle_memnode_event_end_2(struct fxt_ev_64 *ev, struct starpu_fxt_o
 	unsigned memnode = ev->param[0];
 	unsigned memnode = ev->param[0];
 	unsigned long handle = ev->param[2];
 	unsigned long handle = ev->param[2];
 
 
-	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, memnode, options);
+	memnode_event(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr, handle, 0, 0, 0, memnode, options);
 }
 }
 
 
 static void handle_push_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
 static void handle_push_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *eventstr)
@@ -3702,13 +3712,12 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				if (options->memory_states)
 				if (options->memory_states)
 					handle_data_state(&ev, options, "SS");
 					handle_data_state(&ev, options, "SS");
 				break;
 				break;
-                       case _STARPU_FUT_DATA_REQUEST_CREATED:
-                               if (!options->no_bus && options->memory_states)
-                               {
-                                       handle_memnode_event_start_4(&ev, options, "rc");
-                               }
-                               break;
-
+			case _STARPU_FUT_DATA_REQUEST_CREATED:
+				if (!options->no_bus && options->memory_states)
+				{
+					handle_data_request(&ev, options, "rc");
+				}
+				break;
 		  case _STARPU_FUT_PAPI_TASK_EVENT_VALUE:
 		  case _STARPU_FUT_PAPI_TASK_EVENT_VALUE:
 				handle_papi_event(&ev, options);
 				handle_papi_event(&ev, options);
 				break;
 				break;
@@ -4207,18 +4216,6 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		free(options->worker_archtypes[i].devices);
 		free(options->worker_archtypes[i].devices);
 
 
-	struct _starpu_symbol_name *itor, *next;
-	for (itor = _starpu_symbol_name_list_begin(&symbol_list);
-		itor != _starpu_symbol_name_list_end(&symbol_list);
-		itor = next)
-	{
-		next = _starpu_symbol_name_list_next(itor);
-
-		_starpu_symbol_name_list_erase(&symbol_list, itor);
-		free(itor->name);
-		_starpu_symbol_name_delete(itor);
-	}
-
 	_starpu_fxt_component_deinit();
 	_starpu_fxt_component_deinit();
 
 
 	free_worker_ids();
 	free_worker_ids();
@@ -4608,6 +4605,17 @@ void _starpu_fxt_paje_file_init(struct starpu_fxt_options *options)
 static
 static
 void _starpu_fxt_paje_file_close(void)
 void _starpu_fxt_paje_file_close(void)
 {
 {
+	struct _starpu_symbol_name *itor, *next; /* cursor over symbol_list and its saved successor */
+	for (itor = _starpu_symbol_name_list_begin(&symbol_list);
+		itor != _starpu_symbol_name_list_end(&symbol_list);
+		itor = next)
+	{
+		next = _starpu_symbol_name_list_next(itor); /* read the successor first: itor is erased and deleted below */
+
+		_starpu_symbol_name_list_erase(&symbol_list, itor);
+		free(itor->name); /* the name string is released separately from the entry itself */
+		_starpu_symbol_name_delete(itor);
+	}
 	if (out_paje_file)
 	if (out_paje_file)
 		fclose(out_paje_file);
 		fclose(out_paje_file);
 }
 }
@@ -4658,6 +4666,7 @@ uint64_t _starpu_fxt_find_start_time(char *filename_in)
 
 
 void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 {
 {
+	starpu_drivers_preinit();
 	_starpu_fxt_options_set_dir(options);
 	_starpu_fxt_options_set_dir(options);
 	_starpu_fxt_dag_init(options->dag_path);
 	_starpu_fxt_dag_init(options->dag_path);
 	_starpu_fxt_distrib_file_init(options);
 	_starpu_fxt_distrib_file_init(options);

+ 3 - 2
src/drivers/cpu/driver_cpu.c

@@ -40,6 +40,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <datawizard/malloc.h>
+#include <datawizard/datawizard.h>
 #include <core/simgrid.h>
 #include <core/simgrid.h>
 #include <core/task.h>
 #include <core/task.h>
 #include <core/disk.h>
 #include <core/disk.h>
@@ -341,7 +342,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 		return ret;
 		return ret;
 	}
 	}
 
 
-	res = __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 
 	if (!pending_task)
 	if (!pending_task)
 		task = _starpu_get_worker_task(cpu_worker, workerid, memnode);
 		task = _starpu_get_worker_task(cpu_worker, workerid, memnode);
@@ -429,7 +430,7 @@ int _starpu_cpu_driver_deinit(struct _starpu_worker *cpu_worker)
 	_STARPU_TRACE_WORKER_DEINIT_START;
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
 
 	unsigned memnode = cpu_worker->memory_node;
 	unsigned memnode = cpu_worker->memory_node;
-	_starpu_handle_all_pending_node_data_requests(memnode);
+	_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
 
 	/* In case there remains some memory that was automatically
 	/* In case there remains some memory that was automatically
 	 * allocated by StarPU, we release it now. Note that data
 	 * allocated by StarPU, we release it now. Note that data

+ 4 - 36
src/drivers/cuda/driver_cuda.c

@@ -37,6 +37,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <datawizard/malloc.h>
+#include <datawizard/datawizard.h>
 #include <core/task.h>
 #include <core/task.h>
 #include <common/knobs.h>
 #include <common/knobs.h>
 
 
@@ -935,14 +936,13 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	if (!idle_tasks)
 	if (!idle_tasks)
 	{
 	{
 		/* No task ready yet, no better thing to do than waiting */
 		/* No task ready yet, no better thing to do than waiting */
-		__starpu_datawizard_progress(1, !idle_transfers);
+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
 		return 0;
 		return 0;
 	}
 	}
 #endif
 #endif
 
 
 	/* Something done, make some progress */
 	/* Something done, make some progress */
-	res = !idle_tasks || !idle_transfers;
-	res |= __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 
 	/* And pull tasks */
 	/* And pull tasks */
 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, worker0->memory_node);
 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, worker0->memory_node);
@@ -950,9 +950,6 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	if (!res)
 	if (!res)
 		starpu_pthread_wait_wait(&worker0->wait);
 		starpu_pthread_wait_wait(&worker0->wait);
-#else
-	if (!res)
-		return 0;
 #endif
 #endif
 
 
 	for (i = 0; i < (int) worker_set->nworkers; i++)
 	for (i = 0; i < (int) worker_set->nworkers; i++)
@@ -972,35 +969,6 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		{
 		{
 			/* this is neither a cuda or a cublas task */
 			/* this is neither a cuda or a cublas task */
 			_starpu_worker_refuse_task(worker, task);
 			_starpu_worker_refuse_task(worker, task);
-#if 0
-			if (worker->pipeline_length)
-			{
-				int j;
-				for (j = 0; j < worker->ntasks; j++)
-				{
-					const int j_mod = (j+worker->first_task)%STARPU_MAX_PIPELINE;
-					if (task == worker->current_tasks[j_mod])
-					{
-						worker->current_tasks[j_mod] = NULL;
-						if (j == 0)
-						{
-							worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
-							_starpu_set_current_task(NULL);
-						}
-						break;
-					}
-				}
-				STARPU_ASSERT(j<worker->ntasks);
-			}
-			else
-			{
-				worker->current_task = NULL;
-				_starpu_set_current_task(NULL);
-			}
-			worker->ntasks--;
-			int res = _starpu_push_task_to_workers(task);
-			STARPU_ASSERT_MSG(res == 0, "_starpu_push_task_to_workers() unexpectedly returned = %d\n", res);
-#endif
 			continue;
 			continue;
 		}
 		}
 
 
@@ -1039,7 +1007,7 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
 		if (!usersleft)
 		if (!usersleft)
                 {
                 {
 			/* I'm last, deinitialize device */
 			/* I'm last, deinitialize device */
-			_starpu_handle_all_pending_node_data_requests(memnode);
+			_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
 
 			/* In case there remains some memory that was automatically
 			/* In case there remains some memory that was automatically
 			 * allocated by StarPU, we release it now. Note that data
 			 * allocated by StarPU, we release it now. Note that data

+ 3 - 3
src/drivers/mp_common/source_common.c

@@ -978,7 +978,7 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
 		}
 		}
 	}
 	}
 
 
-        res |= __starpu_datawizard_progress(1, 1);
+        res |= __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 
         /* Handle message which have been store */
         /* Handle message which have been store */
         _starpu_src_common_handle_stored_async(mp_node);
         _starpu_src_common_handle_stored_async(mp_node);
@@ -1075,7 +1075,7 @@ void _starpu_src_common_workers_set(struct _starpu_worker_set * worker_set, int
         for (device = 0; device < ndevices; device++)
         for (device = 0; device < ndevices; device++)
 	{
 	{
         	_STARPU_TRACE_END_PROGRESS(memnode[device]);
         	_STARPU_TRACE_END_PROGRESS(memnode[device]);
-                _starpu_handle_all_pending_node_data_requests(memnode[device]);
+                _starpu_datawizard_handle_all_pending_node_data_requests(memnode[device]);
 	}
 	}
 
 
         /* In case there remains some memory that was automatically
         /* In case there remains some memory that was automatically
@@ -1107,7 +1107,7 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set, unsigned
 
 
         _STARPU_TRACE_END_PROGRESS(memnode);
         _STARPU_TRACE_END_PROGRESS(memnode);
 
 
-        _starpu_handle_all_pending_node_data_requests(memnode);
+        _starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
 
         /* In case there remains some memory that was automatically
         /* In case there remains some memory that was automatically
          * allocated by StarPU, we release it now. Note that data
          * allocated by StarPU, we release it now. Note that data

+ 4 - 4
src/drivers/opencl/driver_opencl.c

@@ -31,6 +31,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <datawizard/malloc.h>
+#include <datawizard/datawizard.h>
 #include <core/task.h>
 #include <core/task.h>
 #include <common/knobs.h>
 #include <common/knobs.h>
 
 
@@ -787,13 +788,12 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 	if (!idle_tasks)
 	if (!idle_tasks)
 	{
 	{
 		/* No task ready yet, no better thing to do than waiting */
 		/* No task ready yet, no better thing to do than waiting */
-		__starpu_datawizard_progress(1, !idle_transfers);
+		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
 		return 0;
 		return 0;
 	}
 	}
 #endif
 #endif
 
 
-	res = !idle_tasks || !idle_transfers;
-	res |= __starpu_datawizard_progress(1, 1);
+	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);
 
 
 	task = _starpu_get_worker_task(worker, workerid, memnode);
 	task = _starpu_get_worker_task(worker, workerid, memnode);
 
 
@@ -840,7 +840,7 @@ int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
 
 
 	unsigned memnode = worker->memory_node;
 	unsigned memnode = worker->memory_node;
 
 
-	_starpu_handle_all_pending_node_data_requests(memnode);
+	_starpu_datawizard_handle_all_pending_node_data_requests(memnode);
 
 
 	/* In case there remains some memory that was automatically
 	/* In case there remains some memory that was automatically
 	 * allocated by StarPU, we release it now. Note that data
 	 * allocated by StarPU, we release it now. Note that data

+ 3 - 0
src/profiling/profiling.c

@@ -114,6 +114,9 @@ int starpu_profiling_status_set(int status)
 	{
 	{
 		struct _starpu_worker *worker_struct = _starpu_get_worker_struct(worker);
 		struct _starpu_worker *worker_struct = _starpu_get_worker_struct(worker);
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_struct->sched_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_struct->sched_mutex);
+	}
+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
+	{
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[worker]);
 		STARPU_PTHREAD_MUTEX_LOCK(&worker_info_mutex[worker]);
 	}
 	}
 
 

+ 1 - 1
src/sched_policies/component_best_implementation.c

@@ -112,7 +112,7 @@ static struct starpu_task * best_implementation_pull_task(struct starpu_sched_co
 	}
 	}
 	if(task)
 	if(task)
 		/* this worker can execute this task as it was returned by a pop*/
 		/* this worker can execute this task as it was returned by a pop*/
-		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_worker_get_id_check());
+		(void)find_best_impl(component->tree->sched_ctx_id, task, starpu_bitmap_first(&component->workers_in_ctx));
 	return task;
 	return task;
 }
 }
 
 

+ 3 - 1
src/sched_policies/component_fifo.c

@@ -180,8 +180,10 @@ static struct starpu_task * fifo_pull_task(struct starpu_sched_component * compo
 	struct starpu_task * task;
 	struct starpu_task * task;
 	if (data->ready && to->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE)
 	if (data->ready && to->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE)
 		task = _starpu_fifo_pop_first_ready_task(queue, starpu_bitmap_first(&to->workers_in_ctx), -1);
 		task = _starpu_fifo_pop_first_ready_task(queue, starpu_bitmap_first(&to->workers_in_ctx), -1);
+	else if (to->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS)
+		task = _starpu_fifo_pop_task(queue, starpu_bitmap_first(&to->workers_in_ctx));
 	else
 	else
-		task = _starpu_fifo_pop_task(queue, starpu_worker_get_id_check());
+		task = _starpu_fifo_pop_task(queue, -1);
 	if(task && data->exp)
 	if(task && data->exp)
 	{
 	{
 		if(!isnan(task->predicted))
 		if(!isnan(task->predicted))

+ 3 - 3
src/sched_policies/component_worker.c

@@ -443,8 +443,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 		if(task)
 		if(task)
 		{
 		{
 			_starpu_worker_task_list_transfer_started(list, task);
 			_starpu_worker_task_list_transfer_started(list, task);
-			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			starpu_push_task_end(task);
 			starpu_push_task_end(task);
+			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			goto ret;
 			goto ret;
 		}
 		}
 		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
@@ -470,8 +470,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 			STARPU_COMPONENT_MUTEX_LOCK(&list->mutex);
 			STARPU_COMPONENT_MUTEX_LOCK(&list->mutex);
 			_starpu_worker_task_list_add(list, task);
 			_starpu_worker_task_list_add(list, task);
 			_starpu_worker_task_list_transfer_started(list, task);
 			_starpu_worker_task_list_transfer_started(list, task);
-			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			starpu_push_task_end(task);
 			starpu_push_task_end(task);
+			STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 			goto ret;
 			goto ret;
 		}
 		}
 		struct starpu_sched_component * combined_worker_component = starpu_sched_component_worker_get(component->tree->sched_ctx_id, workerid);
 		struct starpu_sched_component * combined_worker_component = starpu_sched_component_worker_get(component->tree->sched_ctx_id, workerid);
@@ -486,8 +486,8 @@ static struct starpu_task * simple_worker_pull_task(struct starpu_sched_componen
 		STARPU_COMPONENT_MUTEX_LOCK(&list->mutex);
 		STARPU_COMPONENT_MUTEX_LOCK(&list->mutex);
 		_starpu_worker_task_list_add(list, task);
 		_starpu_worker_task_list_add(list, task);
 		_starpu_worker_task_list_transfer_started(list, task);
 		_starpu_worker_task_list_transfer_started(list, task);
-		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 		starpu_push_task_end(task);
 		starpu_push_task_end(task);
+		STARPU_COMPONENT_MUTEX_UNLOCK(&list->mutex);
 	}
 	}
 ret:
 ret:
 	return task;
 	return task;

+ 25 - 2
src/sched_policies/fifo_queues.c

@@ -352,6 +352,29 @@ int _starpu_normalize_prio(int priority, int num_priorities, unsigned sched_ctx_
 	return ((num_priorities-1)/(max-min)) * (priority - min);
 	return ((num_priorities-1)/(max-min)) * (priority - min);
 }
 }
 
 
+size_t _starpu_size_non_ready_buffers(struct starpu_task *task, unsigned worker) /* Returns the summed starpu_data_get_size() of the buffers of task whose is_valid status is 0 on the node looked up for worker. */
+{
+	size_t cnt = 0; /* running total of the sizes of buffers found not valid */
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	unsigned index;
+
+	for (index = 0; index < nbuffers; index++)
+	{
+		starpu_data_handle_t handle;
+		unsigned buffer_node = _starpu_task_data_get_node_on_worker(task, index, worker); /* NOTE(review): presumably the memory node this buffer would use on worker -- confirm against the helper */
+
+		handle = STARPU_TASK_GET_HANDLE(task, index);
+
+		int is_valid;
+		starpu_data_query_status(handle, buffer_node, NULL, &is_valid, NULL); /* only the validity flag is requested; the other two outputs are passed as NULL */
+
+		if (!is_valid)
+			cnt+=starpu_data_get_size(handle); /* weigh the missing buffer by its data size, not by a unit count */
+	}
+
+	return cnt;
+}
+
 int _starpu_count_non_ready_buffers(struct starpu_task *task, unsigned worker)
 int _starpu_count_non_ready_buffers(struct starpu_task *task, unsigned worker)
 {
 {
 	int cnt = 0;
 	int cnt = 0;
@@ -392,7 +415,7 @@ struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq
 
 
 		int first_task_priority = task->priority;
 		int first_task_priority = task->priority;
 
 
-		int non_ready_best = INT_MAX;
+		size_t non_ready_best = SIZE_MAX;
 
 
 		for (current = task; current; current = current->next)
 		for (current = task; current; current = current->next)
 		{
 		{
@@ -400,7 +423,7 @@ struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq
 
 
 			if (priority >= first_task_priority)
 			if (priority >= first_task_priority)
 			{
 			{
-				int non_ready = _starpu_count_non_ready_buffers(current, workerid);
+				size_t non_ready = _starpu_size_non_ready_buffers(current, workerid);
 				if (non_ready < non_ready_best)
 				if (non_ready < non_ready_best)
 				{
 				{
 					non_ready_best = non_ready;
 					non_ready_best = non_ready;

+ 1 - 0
src/sched_policies/fifo_queues.h

@@ -69,6 +69,7 @@ struct starpu_task *_starpu_fifo_pop_local_task(struct _starpu_fifo_taskq *fifo)
 struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo, int workerid);
 struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo, int workerid);
 int _starpu_normalize_prio(int priority, int num_priorities, unsigned sched_ctx_id);
 int _starpu_normalize_prio(int priority, int num_priorities, unsigned sched_ctx_id);
 int _starpu_count_non_ready_buffers(struct starpu_task *task, unsigned worker);
 int _starpu_count_non_ready_buffers(struct starpu_task *task, unsigned worker);
+size_t _starpu_size_non_ready_buffers(struct starpu_task *task, unsigned worker);
 struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq *fifo_queue, unsigned workerid, int num_priorities);
 struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq *fifo_queue, unsigned workerid, int num_priorities);
 
 
 #endif // __FIFO_QUEUES_H__
 #endif // __FIFO_QUEUES_H__

+ 2 - 2
src/sched_policies/prio_deque.c

@@ -94,7 +94,7 @@ struct starpu_task *_starpu_prio_deque_deque_first_ready_task(struct _starpu_pri
 			return NULL;
 			return NULL;
 
 
 		int first_task_priority = task->priority;
 		int first_task_priority = task->priority;
-		int non_ready_best = INT_MAX;
+		size_t non_ready_best = SIZE_MAX;
 
 
 		for (current = starpu_task_prio_list_begin(&pdeque->list);
 		for (current = starpu_task_prio_list_begin(&pdeque->list);
 		     current != starpu_task_prio_list_end(&pdeque->list);
 		     current != starpu_task_prio_list_end(&pdeque->list);
@@ -104,7 +104,7 @@ struct starpu_task *_starpu_prio_deque_deque_first_ready_task(struct _starpu_pri
 
 
 			if (priority >= first_task_priority)
 			if (priority >= first_task_priority)
 			{
 			{
-				int non_ready = _starpu_count_non_ready_buffers(current, workerid);
+				size_t non_ready = _starpu_size_non_ready_buffers(current, workerid);
 				if (non_ready < non_ready_best)
 				if (non_ready < non_ready_best)
 				{
 				{
 					non_ready_best = non_ready;
 					non_ready_best = non_ready;

+ 5 - 0
src/sched_policies/work_stealing_policy.c

@@ -610,6 +610,11 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 	if (_starpu_worker_trylock(victim))
 	if (_starpu_worker_trylock(victim))
 	{
 	{
 		/* victim is busy, don't bother it, come back later */
 		/* victim is busy, don't bother it, come back later */
+#ifdef STARPU_SIMGRID
+		starpu_sleep(0.000001);
+		/* Make sure we come back and not block */
+		starpu_wake_worker_no_relax(workerid);
+#endif
 		return NULL;
 		return NULL;
 	}
 	}
 	if (ws->per_worker[victim].running && ws->per_worker[victim].queue.ntasks > 0)
 	if (ws->per_worker[victim].running && ws->per_worker[victim].queue.ntasks > 0)

+ 0 - 0
src/util/execute_on_all.c


Някои файлове не бяха показани, защото твърде много файлове са промени