Browse Source

Merge branch 'master' into fpga

Nathalie Furmento 5 years ago
parent
commit
536fe80f62
100 changed files with 1733 additions and 795 deletions
  1. 2 0
      ChangeLog
  2. 22 12
      configure.ac
  3. 14 1
      doc/doxygen/chapters/210_check_list_performance.doxy
  4. 5 1
      doc/doxygen/chapters/370_online_performance_tools.doxy
  5. 13 2
      doc/doxygen/chapters/470_simgrid.doxy
  6. 8 0
      doc/doxygen/chapters/501_environment_variables.doxy
  7. 12 4
      doc/doxygen/chapters/510_configure_options.doxy
  8. 1 0
      examples/Makefile.am
  9. 11 0
      examples/api/bcsr_data_interface.c
  10. 10 0
      examples/api/block_data_interface.c
  11. 2 0
      examples/api/coo_data_interface.c
  12. 9 0
      examples/api/csr_data_interface.c
  13. 10 0
      examples/api/matrix_data_interface.c
  14. 2 0
      examples/api/multiformat_data_interface.c
  15. 37 0
      examples/api/tensor_data_interface.c
  16. 5 0
      examples/api/variable_data_interface.c
  17. 8 0
      examples/api/vector_data_interface.c
  18. 2 0
      examples/api/void_data_interface.c
  19. 17 20
      examples/cholesky/cholesky_implicit.c
  20. 5 0
      examples/filters/fblock.c
  21. 2 2
      examples/filters/fblock_opencl.c
  22. 13 10
      examples/filters/fblock_opencl_kernel.cl
  23. 19 1
      include/starpu_data.h
  24. 10 16
      include/starpu_fxt.h
  25. 7 2
      min-dgels/Makefile.in
  26. 0 152
      min-dgels/additional/blaswrap.h
  27. 8 0
      min-dgels/base/F2CLIBS/libf2c/Makefile
  28. 0 152
      min-dgels/base/INCLUDE/blaswrap.h
  29. 5 3
      mpi/examples/Makefile.am
  30. 10 5
      mpi/examples/benchs/abstract_sendrecv_bench.c
  31. 17 12
      mpi/examples/benchs/burst_helper.c
  32. 10 5
      mpi/examples/benchs/gemm_helper.c
  33. 2 1
      mpi/examples/benchs/sendrecv_bench.c
  34. 10 6
      mpi/examples/benchs/sendrecv_parallel_tasks_bench.c
  35. 1 1
      mpi/examples/filters/filter.c
  36. 149 41
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  37. 14 11
      mpi/examples/matrix_decomposition/mpi_decomposition_params.c
  38. 23 3
      mpi/examples/mpi_lu/plu_example.c
  39. 13 0
      mpi/examples/mpi_lu/plu_implicit_example.c
  40. 26 11
      mpi/examples/mpi_lu/plu_outofcore_example.c
  41. 18 14
      mpi/src/mpi/starpu_mpi_mpi.c
  42. 7 4
      mpi/src/nmad/starpu_mpi_nmad.c
  43. 1 1
      mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c
  44. 2 2
      mpi/src/starpu_mpi_coop_sends.c
  45. 1 1
      mpi/src/starpu_mpi_fxt.h
  46. 2 0
      mpi/src/starpu_mpi_private.c
  47. 2 1
      mpi/src/starpu_mpi_private.h
  48. 2 2
      mpi/tests/Makefile.am
  49. 1 1
      mpi/tests/broadcast.c
  50. 18 2
      mpi/tests/early_request.c
  51. 1 1
      mpi/tests/insert_task_compute.c
  52. 5 3
      mpi/tests/pingpong.c
  53. 11 0
      src/common/fxt.c
  54. 2 0
      src/common/utils.h
  55. 4 16
      src/core/dependencies/cg.c
  56. 31 9
      src/core/dependencies/data_arbiter_concurrency.c
  57. 29 11
      src/core/dependencies/data_concurrency.c
  58. 2 2
      src/core/dependencies/data_concurrency.h
  59. 14 4
      src/core/dependencies/implicit_data_deps.c
  60. 1 1
      src/core/dependencies/implicit_data_deps.h
  61. 16 11
      src/core/dependencies/tags.c
  62. 2 0
      src/core/dependencies/tags.h
  63. 1 1
      src/core/jobs.c
  64. 3 0
      src/core/perfmodel/multiple_regression.c
  65. 13 10
      src/datawizard/coherency.c
  66. 1 0
      src/datawizard/coherency.h
  67. 4 4
      src/datawizard/copy_driver.c
  68. 6 2
      src/datawizard/interfaces/data_interface.c
  69. 31 8
      src/datawizard/user_interactions.c
  70. 1 1
      src/datawizard/write_back.c
  71. 2 2
      src/debug/latency.c
  72. 19 13
      src/debug/traces/starpu_fxt.c
  73. 1 0
      src/debug/traces/starpu_fxt.h
  74. 8 6
      src/debug/traces/starpu_fxt_mpi.c
  75. 7 4
      src/debug/traces/starpu_paje.c
  76. 30 25
      src/sched_policies/component_work_stealing.c
  77. 68 22
      src/sched_policies/work_stealing_policy.c
  78. 1 2
      src/util/execute_on_all.c
  79. 1 0
      src/util/starpu_data_cpy.c
  80. 18 1
      tests/Makefile.am
  81. 214 0
      tests/datawizard/acquire_release_to.c
  82. 3 3
      tests/datawizard/bcsr.c
  83. 3 3
      tests/datawizard/interfaces/block/block_opencl.c
  84. 15 23
      tests/datawizard/interfaces/block/block_opencl_kernel.cl
  85. 1 1
      tests/datawizard/interfaces/tensor/tensor_interface.c
  86. 3 3
      tests/datawizard/interfaces/tensor/tensor_opencl.c
  87. 18 26
      tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl
  88. 8 0
      tests/energy/energy_efficiency.c
  89. 4 4
      tests/helper/starpu_data_dup_ro.c
  90. 343 0
      tests/microbenchs/bandwidth.c
  91. 75 0
      tests/microbenchs/bandwidth_scheds.sh
  92. 1 0
      tests/overlap/overlap.c
  93. 1 1
      tests/parallel_tasks/explicit_combined_worker.c
  94. 1 0
      tools/Makefile.am
  95. 23 0
      tools/dev/valgrind/glpk.suppr
  96. 1 2
      tools/dev/valgrind/libc.suppr
  97. 1 1
      tools/dev/valgrind/padico.suppr
  98. 95 65
      tools/gdbinit
  99. 2 2
      tools/starpu_fxt_tool.c
  100. 0 0
      tools/starpu_perfmodel_recdump.c

+ 2 - 0
ChangeLog

@@ -42,6 +42,7 @@ New features:
   * Add a task prefetch level, to improve retaining data in accelerators so we
   * Add a task prefetch level, to improve retaining data in accelerators so we
     can make prefetch more aggressive.
     can make prefetch more aggressive.
   * Add starpu_data_dup_ro().
   * Add starpu_data_dup_ro().
+  * Add starpu_data_release_to() and starpu_data_release_to_on_node().
 
 
 Small changes:
 Small changes:
   * Add a synthetic energy efficiency testcase.
   * Add a synthetic energy efficiency testcase.
@@ -51,6 +52,7 @@ StarPU 1.3.5 (git revision xxx)
 
 
 Small changes:
 Small changes:
   * Move MPI cache functions into the public API
   * Move MPI cache functions into the public API
+  * Add STARPU_MPI_NOBIND environment variable.
 
 
 StarPU 1.3.4 (git revision c37a5d024cd997596da41f765557c58099baf896)
 StarPU 1.3.4 (git revision c37a5d024cd997596da41f765557c58099baf896)
 ====================================================================
 ====================================================================

+ 22 - 12
configure.ac

@@ -249,6 +249,8 @@ AC_ARG_WITH(simgrid-lib-dir,
 	], [simgrid_lib_dir=no])
 	], [simgrid_lib_dir=no])
 
 
 if test x$enable_simgrid = xyes ; then
 if test x$enable_simgrid = xyes ; then
+	PKG_CHECK_MODULES([SIMGRID], [simgrid])
+
    	if test -n "$SIMGRID_CFLAGS" ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
@@ -836,7 +838,10 @@ if test x"$enable_native_winthreads" = xyes ; then
 		AC_DEFINE(STARPU_NATIVE_WINTHREADS,[1],[Using native windows threads]),
 		AC_DEFINE(STARPU_NATIVE_WINTHREADS,[1],[Using native windows threads]),
 		AC_MSG_ERROR([pthread_create unavailable]))
 		AC_MSG_ERROR([pthread_create unavailable]))
 else
 else
-    AC_CHECK_LIB([pthread], [pthread_create])
+    AC_CHECK_LIB([pthread], [pthread_create], [
+        LIBS="$LIBS -lpthread"
+        STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lpthread"
+    ])
 fi
 fi
 
 
 AC_SEARCH_LIBS([sqrt],[m],,AC_MSG_ERROR([math library unavailable]))
 AC_SEARCH_LIBS([sqrt],[m],,AC_MSG_ERROR([math library unavailable]))
@@ -957,8 +962,8 @@ AC_CHECK_FUNCS([mkdtemp])
 
 
 AC_CHECK_FUNCS([pread pwrite])
 AC_CHECK_FUNCS([pread pwrite])
 
 
-AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--disable-hdf5], [disable HDF5 support])],
-                    enable_hdf5=$enableval, enable_hdf5=maybe)
+AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--enable-hdf5], [disable HDF5 support])],
+                    enable_hdf5=$enableval, enable_hdf5=no)
 
 
 if test "x$enable_hdf5" != xno ; then
 if test "x$enable_hdf5" != xno ; then
 	AC_ARG_WITH(hdf5-include-dir,
 	AC_ARG_WITH(hdf5-include-dir,
@@ -1017,8 +1022,11 @@ fi
 
 
 if test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno"; then
 if test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno"; then
         AC_DEFINE([STARPU_HAVE_HDF5], [1], [Define to 1 if you have the <hdf5.h> header file.])
         AC_DEFINE([STARPU_HAVE_HDF5], [1], [Define to 1 if you have the <hdf5.h> header file.])
+	enable_hdf5=yes
+else
+	enable_hdf5=no
 fi
 fi
-AM_CONDITIONAL(STARPU_HAVE_HDF5, test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno")
+AM_CONDITIONAL(STARPU_HAVE_HDF5, test "x$enable_hdf5" = "xyes")
 
 
 
 
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
@@ -2523,8 +2531,8 @@ AC_SUBST(STARPU_EXPORT_DYNAMIC)
 # Computes the maximum number of different kernels a message-passing sink
 # Computes the maximum number of different kernels a message-passing sink
 # can lookup for and launch.
 # can lookup for and launch.
 AC_MSG_CHECKING(Maximum number of message-passing kernels)
 AC_MSG_CHECKING(Maximum number of message-passing kernels)
-AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING([
-	      -enable-maxmpkernels=<number>],
+AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING(
+	      [-enable-maxmpkernels=<number>],
 	      [maximum number of kernels a message-passing sink can lookup
 	      [maximum number of kernels a message-passing sink can lookup
 	      for and execute])],
 	      for and execute])],
 	      maxmpkernels=$enableval, maxmpkernels=10)
 	      maxmpkernels=$enableval, maxmpkernels=10)
@@ -3111,6 +3119,9 @@ fi
 AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
 AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
 			[Disable multiple linear regression models])],
 			[Disable multiple linear regression models])],
 			enable_mlr=$enableval, enable_mlr=$default_enable_mlr)
 			enable_mlr=$enableval, enable_mlr=$default_enable_mlr)
+AC_ARG_ENABLE(mlr-system-blas, [AS_HELP_STRING([--enable-mlr-system-blas],
+			[Make the multiple linear regression models use the system BLAS instead of min-dgels])],
+			enable_mlr_blas=$enableval, enable_mlr_blas=no)
 
 
 AC_MSG_CHECKING(whether multiple linear regression models are disabled)
 AC_MSG_CHECKING(whether multiple linear regression models are disabled)
 if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
@@ -3121,11 +3132,11 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 	if test x$blas_lib = xnone ; then
 	if test x$blas_lib = xnone ; then
 	   use_system_lapack=no
 	   use_system_lapack=no
 	fi
 	fi
-	if test x$use_system_lapack = xyes; then
+	if test x$enable_mlr_blas = xyes -a test x$use_system_lapack = xyes; then
 	   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
 	   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
 		LDFLAGS="-llapack $LDFLAGS"
 		LDFLAGS="-llapack $LDFLAGS"
 	else
 	else
-		if test x$blas_lib = xmkl; then
+		if test x$enable_mlr_blas=xyes -a test x$blas_lib = xmkl; then
 		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use mkl library])
 		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use mkl library])
 		else
 		else
 			AC_MSG_CHECKING(whether min-dgels is linked)
 			AC_MSG_CHECKING(whether min-dgels is linked)
@@ -3142,9 +3153,6 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 					install_min_dgels=no
 					install_min_dgels=no
 					support_mlr=no
 					support_mlr=no
 				else
 				else
-					if test ! -d $PWD/min-dgels; then
-						cp -r $srcdir/min-dgels $PWD/
-					fi
 					AC_MSG_RESULT(yes)
 					AC_MSG_RESULT(yes)
 					DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/minlibblas.a $STARPU_BUILD_DIR/min-dgels/build/minlibdgels.a $STARPU_BUILD_DIR/min-dgels/build/minlibf2c.a -Wl,--end-group"
 					DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/minlibblas.a $STARPU_BUILD_DIR/min-dgels/build/minlibdgels.a $STARPU_BUILD_DIR/min-dgels/build/minlibf2c.a -Wl,--end-group"
 					AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
 					AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
@@ -3590,7 +3598,7 @@ if test "x$enable_shared" = xno; then
         # No .so, so application will unexpected have to know which -l to
         # No .so, so application will unexpected have to know which -l to
         # use. Give them in .pc file.
         # use. Give them in .pc file.
 	AC_DEFINE(STARPU_STATIC_ONLY, [1], [Only static compilation was made])
 	AC_DEFINE(STARPU_STATIC_ONLY, [1], [Only static compilation was made])
-	STARPU_EXPORTED_LIBS="$LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
+	STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
 fi
 fi
 AC_SUBST(STARPU_EXPORTED_LIBS)
 AC_SUBST(STARPU_EXPORTED_LIBS)
 
 
@@ -3631,6 +3639,7 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
+  test -e tests/microbenchs/bandwidth_scheds.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/bandwidth_scheds.sh tests/microbenchs/
   mkdir -p tests/energy
   mkdir -p tests/energy
   test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
   test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
   test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
   test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
@@ -3806,6 +3815,7 @@ AC_MSG_NOTICE([
                Scheduler Hypervisor:                          $build_sc_hypervisor
                Scheduler Hypervisor:                          $build_sc_hypervisor
                simgrid enabled:                               $enable_simgrid
                simgrid enabled:                               $enable_simgrid
                ayudame enabled:                               $ayu_msg
                ayudame enabled:                               $ayu_msg
+               HDF5 enabled:                                  $enable_hdf5
 	       Native fortran support:                        $enable_build_fortran
 	       Native fortran support:                        $enable_build_fortran
 	       Native MPI fortran support:                    $use_mpi_fort
 	       Native MPI fortran support:                    $use_mpi_fort
 	       Support for multiple linear regression models: $support_mlr
 	       Support for multiple linear regression models: $support_mlr

+ 14 - 1
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -352,7 +352,20 @@ use <c>export STARPU_CALIBRATE=1</c> (\ref STARPU_CALIBRATE). This may be necess
 has not-so-stable performance. StarPU will force calibration (and thus ignore
 has not-so-stable performance. StarPU will force calibration (and thus ignore
 the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
 the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
 made on each architecture, to avoid bad scheduling decisions just because the
 made on each architecture, to avoid bad scheduling decisions just because the
-first measurements were not so good. Details on the current performance model status
+first measurements were not so good.
+
+Note that StarPU will not record the very first measurement for a given codelet
+and a given size, because it would most often be hit by computation library
+loading or initialization. StarPU will also throw measurements away if it
+notices that after computing an average execution time, it notices that most
+subsequent tasks have an execution time largely outside the computed average
+("Too big deviation for model..." warning messages). By looking at the details
+of the message and their reported measurements, it can highlight that your
+computation library really has non-stable measurements, which is probably an
+indication of an issue in the computation library, or the execution environment
+(e.g. rogue daemons).
+
+Details on the current performance model status
 can be obtained with the tool <c>starpu_perfmodel_display</c>: the
 can be obtained with the tool <c>starpu_perfmodel_display</c>: the
 option <c>-l</c> lists the available performance models, and the
 option <c>-l</c> lists the available performance models, and the
 option <c>-s</c> allows to choose the performance model to be
 option <c>-s</c> allows to choose the performance model to be

+ 5 - 1
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -375,7 +375,11 @@ parameter are stored in <c>.starpu/sampling/codelets/tmp/</c>
 directory. These files are reused when \ref STARPU_CALIBRATE
 directory. These files are reused when \ref STARPU_CALIBRATE
 environment variable is set to <c>1</c>, to recompute coefficients
 environment variable is set to <c>1</c>, to recompute coefficients
 based on the current, but also on the previous
 based on the current, but also on the previous
-executions. Additionally, when multiple linear regression models are
+executions. By default StarPU uses a lightweight dgels implementation, but the
+\ref enable-mlr-system-blas "--enable-mlr-system-blas" configure option can be
+used to make StarPU use a system-provided dgels BLAS.
+
+Additionally, when multiple linear regression models are
 disabled (using \ref disable-mlr "--disable-mlr" configure option) or when the
 disabled (using \ref disable-mlr "--disable-mlr" configure option) or when the
 <c>model->combinations</c> are not defined, StarPU will still write
 <c>model->combinations</c> are not defined, StarPU will still write
 output files into <c>.starpu/sampling/codelets/tmp/</c> to allow
 output files into <c>.starpu/sampling/codelets/tmp/</c> to allow

+ 13 - 2
doc/doxygen/chapters/470_simgrid.doxy

@@ -167,8 +167,12 @@ theory results), see the \ref STARPU_SIMGRID_TRANSFER_COST, \ref STARPU_SIMGRID_
 
 
 \section SimulationMPIApplications MPI Applications
 \section SimulationMPIApplications MPI Applications
 
 
-StarPU-MPI applications can also be run in SimGrid mode. It needs to be compiled
-with \c smpicc, and run using the <c>starpu_smpirun</c> script, for instance:
+StarPU-MPI applications can also be run in SimGrid mode. smpi currently requires
+that StarPU be build statically only, so <c>--disable-shared</c> needs to be
+passed to <c>./configure</c>.
+
+The application needs to be compiled with \c smpicc, and run using the
+<c>starpu_smpirun</c> script, for instance:
 
 
 \verbatim
 \verbatim
 $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mpi/tests/pingpong
 $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mpi/tests/pingpong
@@ -182,6 +186,13 @@ in case of a heterogeneous platform, it is possible to use the
 option <c>-hostfile-platform</c> in <c>starpu_smpirun</c>, that will define
 option <c>-hostfile-platform</c> in <c>starpu_smpirun</c>, that will define
 \ref STARPU_MPI_HOSTNAMES with the hostnames of your hostfile.
 \ref STARPU_MPI_HOSTNAMES with the hostnames of your hostfile.
 
 
+So as to use FxT traces, libfxt also needs to be built statically, <b>and</b>
+with dynamic linking flags, i.e. with
+
+\verbatim
+CFLAGS=-fPIC ./configure --enable-static
+\endverbatim
+
 \section SimulationDebuggingApplications Debugging Applications
 \section SimulationDebuggingApplications Debugging Applications
 
 
 By default, SimGrid uses its own implementation of threads, which prevents \c gdb
 By default, SimGrid uses its own implementation of threads, which prevents \c gdb

+ 8 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -255,6 +255,14 @@ it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
 workers.
 workers.
 </dd>
 </dd>
 
 
+<dt>STARPU_MPI_NOBIND</dt>
+<dd>
+\anchor STARPU_MPI_NOBIND
+\addindex __env__STARPU_MPI_NOBIND
+Setting it to non-zero will prevent StarPU from binding the MPI to
+a separate core. This is for instance useful when running the testsuite on a single system.
+</dd>
+
 <dt>STARPU_WORKERS_CUDAID</dt>
 <dt>STARPU_WORKERS_CUDAID</dt>
 <dd>
 <dd>
 \anchor STARPU_WORKERS_CUDAID
 \anchor STARPU_WORKERS_CUDAID

+ 12 - 4
doc/doxygen/chapters/510_configure_options.doxy

@@ -571,11 +571,11 @@ Specify the blas library to be used by some of the examples. Librairies availabl
 Enable linking with LevelDB if available
 Enable linking with LevelDB if available
 </dd>
 </dd>
 
 
-<dt>--disable-hdf5</dt>
+<dt>--enable-hdf5</dt>
 <dd>
 <dd>
-\anchor disable-hdf5
-\addindex __configure__--disable-hdf5
-Disable building HDF5 support.
+\anchor enable-hdf5
+\addindex __configure__--enable-hdf5
+Enable building HDF5 support.
 </dd>
 </dd>
 
 
 <dt>--with-hdf5-include-dir=<c>path</c></dt>
 <dt>--with-hdf5-include-dir=<c>path</c></dt>
@@ -768,6 +768,14 @@ this parameter is 10. Experimental.
 Allow to disable multiple linear regression models (see \ref PerformanceModelExample)
 Allow to disable multiple linear regression models (see \ref PerformanceModelExample)
 </dd>
 </dd>
 
 
+<dt>--enable-mlr-system-blas</dt>
+<dd>
+\anchor enable-mlr-system-blas
+\addindex __configure__--enable-mlr-system-blas
+Allow to make multiple linear regression models use the system-provided BLAS for dgels
+(see \ref PerformanceModelExample)
+</dd>
+
 </dl>
 </dl>
 
 
 */
 */

+ 1 - 0
examples/Makefile.am

@@ -206,6 +206,7 @@ STARPU_EXAMPLES +=				\
 	api/csr_data_interface			\
 	api/csr_data_interface			\
 	api/matrix_data_interface		\
 	api/matrix_data_interface		\
 	api/multiformat_data_interface		\
 	api/multiformat_data_interface		\
+	api/tensor_data_interface		\
 	api/variable_data_interface		\
 	api/variable_data_interface		\
 	api/vector_data_interface		\
 	api/vector_data_interface		\
 	api/void_data_interface
 	api/void_data_interface

+ 11 - 0
examples/api/bcsr_data_interface.c

@@ -17,6 +17,17 @@
 // This program checks that the implementation of the BCSR data
 // This program checks that the implementation of the BCSR data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_bcsr_ops my_starpu_interface_bcsr_ops
+#define starpu_bcsr_data_register my_starpu_bcsr_data_register
+#define starpu_bcsr_get_nnz my_starpu_bcsr_get_nnz
+#define starpu_bcsr_get_nrow my_starpu_bcsr_get_nrow
+#define starpu_bcsr_get_firstentry my_starpu_bcsr_get_firstentry
+#define starpu_bcsr_get_r my_starpu_bcsr_get_r
+#define starpu_bcsr_get_c my_starpu_bcsr_get_c
+#define starpu_bcsr_get_elemsize my_starpu_bcsr_get_elemsize
+#define starpu_bcsr_get_local_nzval my_starpu_bcsr_get_local_nzval
+#define starpu_bcsr_get_local_colind my_starpu_bcsr_get_local_colind
+#define starpu_bcsr_get_local_rowptr my_starpu_bcsr_get_local_rowptr
 #include "../../src/datawizard/interfaces/bcsr_interface.c"
 #include "../../src/datawizard/interfaces/bcsr_interface.c"
 
 
 int main()
 int main()

+ 10 - 0
examples/api/block_data_interface.c

@@ -17,6 +17,16 @@
 // This program checks that the implementation of the block data
 // This program checks that the implementation of the block data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_block_ops my_starpu_interface_block_ops
+#define starpu_block_data_register my_starpu_block_data_register
+#define starpu_block_ptr_register my_starpu_block_ptr_register
+#define starpu_block_get_nx my_starpu_block_get_nx
+#define starpu_block_get_ny my_starpu_block_get_ny
+#define starpu_block_get_nz my_starpu_block_get_nz
+#define starpu_block_get_local_ldy my_starpu_block_get_local_ldy
+#define starpu_block_get_local_ldz my_starpu_block_get_local_ldz
+#define starpu_block_get_local_ptr my_starpu_block_get_local_ptr
+#define starpu_block_get_elemsize my_starpu_block_get_elemsize
 #include "../../src/datawizard/interfaces/block_interface.c"
 #include "../../src/datawizard/interfaces/block_interface.c"
 
 
 int main()
 int main()

+ 2 - 0
examples/api/coo_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the COO data
 // This program checks that the implementation of the COO data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_coo_ops my_starpu_interface_coo_ops
+#define starpu_coo_data_register my_starpu_coo_data_register
 #include "../../src/datawizard/interfaces/coo_interface.c"
 #include "../../src/datawizard/interfaces/coo_interface.c"
 
 
 int main()
 int main()

+ 9 - 0
examples/api/csr_data_interface.c

@@ -17,6 +17,15 @@
 // This program checks that the implementation of the CSR data
 // This program checks that the implementation of the CSR data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_csr_ops my_starpu_interface_csr_ops
+#define starpu_csr_data_register my_starpu_csr_data_register
+#define starpu_csr_get_nnz my_starpu_csr_get_nnz
+#define starpu_csr_get_nrow my_starpu_csr_get_nrow
+#define starpu_csr_get_firstentry my_starpu_csr_get_firstentry
+#define starpu_csr_get_elemsize my_starpu_csr_get_elemsize
+#define starpu_csr_get_local_nzval my_starpu_csr_get_local_nzval
+#define starpu_csr_get_local_colind my_starpu_csr_get_local_colind
+#define starpu_csr_get_local_rowptr my_starpu_csr_get_local_rowptr
 #include "../../src/datawizard/interfaces/csr_interface.c"
 #include "../../src/datawizard/interfaces/csr_interface.c"
 
 
 int main()
 int main()

+ 10 - 0
examples/api/matrix_data_interface.c

@@ -17,6 +17,16 @@
 // This program checks that the implementation of the matrix data
 // This program checks that the implementation of the matrix data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_matrix_ops my_starpu_interface_matrix_ops
+#define starpu_matrix_data_register my_starpu_matrix_data_register
+#define starpu_matrix_data_register_allocsize my_starpu_matrix_data_register_allocsize
+#define starpu_matrix_ptr_register my_starpu_matrix_data_ptr_register
+#define starpu_matrix_get_nx my_starpu_matrix_get_nx
+#define starpu_matrix_get_ny my_starpu_matrix_get_ny
+#define starpu_matrix_get_local_ld my_starpu_matrix_get_local_ld
+#define starpu_matrix_get_local_ptr my_starpu_matrix_get_local_ptr
+#define starpu_matrix_get_elemsize my_starpu_matrix_get_elemsize
+#define starpu_matrix_get_allocsize my_starpu_matrix_get_allocsize
 #include "../../src/datawizard/interfaces/matrix_interface.c"
 #include "../../src/datawizard/interfaces/matrix_interface.c"
 
 
 int main()
 int main()

+ 2 - 0
examples/api/multiformat_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the multiformat data
 // This program checks that the implementation of the multiformat data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_multiformat_ops my_starpu_interface_multiformat_ops
+#define starpu_multiformat_data_register my_starpu_multiformat_data_register
 #include "../../src/datawizard/interfaces/multiformat_interface.c"
 #include "../../src/datawizard/interfaces/multiformat_interface.c"
 
 
 int main()
 int main()

+ 37 - 0
examples/api/tensor_data_interface.c

@@ -0,0 +1,37 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+// This program checks that the implementation of the tensor data
+// interface only uses StarPU's public API
+
+#define starpu_interface_tensor_ops my_starpu_interface_tensor_ops
+#define starpu_tensor_data_register my_starpu_tensor_data_register
+#define starpu_tensor_ptr_register my_starpu_tensor_data_ptr_register
+#define starpu_tensor_get_nx my_starpu_tensor_get_nx
+#define starpu_tensor_get_ny my_starpu_tensor_get_ny
+#define starpu_tensor_get_nz my_starpu_tensor_get_nz
+#define starpu_tensor_get_nt my_starpu_tensor_get_nt
+#define starpu_tensor_get_local_ldy my_starpu_tensor_get_local_ldy
+#define starpu_tensor_get_local_ldz my_starpu_tensor_get_local_ldz
+#define starpu_tensor_get_local_ldt my_starpu_tensor_get_local_ldt
+#define starpu_tensor_get_local_ptr my_starpu_tensor_get_local_ptr
+#define starpu_tensor_get_elemsize my_starpu_tensor_get_elemsize
+#include "../../src/datawizard/interfaces/tensor_interface.c"
+
+int main()
+{
+        return 0;
+}

+ 5 - 0
examples/api/variable_data_interface.c

@@ -17,6 +17,11 @@
 // This program checks that the implementation of the variable data
 // This program checks that the implementation of the variable data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_variable_ops my_starpu_interface_variable_ops
+#define starpu_variable_data_register my_starpu_variable_data_register
+#define starpu_variable_ptr_register my_starpu_variable_ptr_register
+#define starpu_variable_get_local_ptr my_starpu_variable_get_local_ptr
+#define starpu_variable_get_elemsize my_starpu_variable_get_elemsize
 #include "../../src/datawizard/interfaces/variable_interface.c"
 #include "../../src/datawizard/interfaces/variable_interface.c"
 
 
 int main()
 int main()

+ 8 - 0
examples/api/vector_data_interface.c

@@ -17,6 +17,14 @@
 // This program checks that the implementation of the vector data
 // This program checks that the implementation of the vector data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_vector_ops my_starpu_interface_vector_ops
+#define starpu_vector_data_register my_starpu_vector_data_register
+#define starpu_vector_data_register_allocsize my_starpu_vector_data_register_allocsize
+#define starpu_vector_ptr_register my_starpu_vector_data_ptr_register
+#define starpu_vector_get_nx my_starpu_vector_get_nx
+#define starpu_vector_get_local_ptr my_starpu_vector_get_local_ptr
+#define starpu_vector_get_elemsize my_starpu_vector_get_elemsize
+#define starpu_vector_get_allocsize my_starpu_vector_get_allocsize
 #include "../../src/datawizard/interfaces/vector_interface.c"
 #include "../../src/datawizard/interfaces/vector_interface.c"
 
 
 int main()
 int main()

+ 2 - 0
examples/api/void_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the void data
 // This program checks that the implementation of the void data
 // interface only uses StarPU's public API
 // interface only uses StarPU's public API
 
 
+#define starpu_interface_void_ops my_starpu_interface_void_ops
+#define starpu_void_data_register my_starpu_void_data_register
 #include "../../src/datawizard/interfaces/void_interface.c"
 #include "../../src/datawizard/interfaces/void_interface.c"
 
 
 int main()
 int main()

+ 17 - 20
examples/cholesky/cholesky_implicit.c

@@ -92,29 +92,26 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		}
 		}
 		starpu_data_wont_use(sdatakk);
 		starpu_data_wont_use(sdatakk);
 
 
-		for (m = k+1; m<nblocks; m++)
+		for (n = k+1; n<nblocks; n++)
 		{
 		{
-                        starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
-			for (n = k+1; n<nblocks; n++)
+                        starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
+			for (m = n; m<nblocks; m++)
 			{
 			{
-				if (n <= m)
-                                {
-					starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
-					starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
-
-					ret = starpu_task_insert(&cl22,
-								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-								 STARPU_R, sdatamk,
-								 STARPU_R, sdatank,
-								 cl22.modes[2], sdatamn,
-								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
-								 STARPU_TAG_ONLY, TAG22(k,m,n),
-								 0);
-					if (ret == -ENODEV) return 77;
-					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-                                }
+				starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
+				starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
+
+				ret = starpu_task_insert(&cl22,
+							 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							 STARPU_R, sdatamk,
+							 STARPU_R, sdatank,
+							 cl22.modes[2], sdatamn,
+							 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
+							 STARPU_TAG_ONLY, TAG22(k,m,n),
+							 0);
+				if (ret == -ENODEV) return 77;
+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 			}
 			}
-			starpu_data_wont_use(sdatamk);
+			starpu_data_wont_use(sdatank);
 		}
 		}
 		starpu_iteration_pop();
 		starpu_iteration_pop();
 	}
 	}

+ 5 - 0
examples/filters/fblock.c

@@ -169,6 +169,11 @@ int main(void)
         print_data(handle);
         print_data(handle);
         starpu_data_unregister(handle);
         starpu_data_unregister(handle);
 
 
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+
         /* Print result block */
         /* Print result block */
         FPRINTF(stderr, "OUT Block\n");
         FPRINTF(stderr, "OUT Block\n");
         print_block(block, NX, NY, NZ, NX, NX*NY);
         print_block(block, NX, NY, NZ, NX, NX*NY);

+ 2 - 2
examples/filters/fblock_opencl.c

@@ -60,8 +60,8 @@ void opencl_func(void *buffers[], void *cl_arg)
 	CHECK_CL_SET_KERNEL_ARG(kernel, 7, sizeof(*factor), factor);
 	CHECK_CL_SET_KERNEL_ARG(kernel, 7, sizeof(*factor), factor);
 
 
 	{
 	{
-		size_t global=nx*ny*nz;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		size_t global[3]={nx,ny,nz};
+		err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 	}
 	starpu_opencl_release_kernel(kernel);
 	starpu_opencl_release_kernel(kernel);

+ 13 - 10
examples/filters/fblock_opencl_kernel.cl

@@ -18,14 +18,17 @@
 
 
 __kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
 __kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
 {
 {
-        int i, j, k;
-        block = (__global char *)block + offset;
-        for(k=0; k<nz ; k++)
-	{
-                for(j=0; j<ny ; j++)
-		{
-                        for(i=0; i<nx ; i++)
-                                block[(k*ldz)+(j*ldy)+i] = factor;
-                }
-        }
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2);
+	if (idx >= nx)
+		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+
+	block = (__global int*) ((__global char *)block + offset);
+	int i = idz*ldz + idy*ldy + idx;
+	block[i] = factor;
 }
 }

+ 19 - 1
include/starpu_data.h

@@ -350,12 +350,30 @@ void starpu_data_release(starpu_data_handle_t handle);
 
 
 /**
 /**
    Similar to starpu_data_release(), except that the data
    Similar to starpu_data_release(), except that the data
-   will be available on the given memory \p node instead of main memory.
+   was made available on the given memory \p node instead of main memory.
    The \p node parameter must be exactly the same as the corresponding \c
    The \p node parameter must be exactly the same as the corresponding \c
    starpu_data_acquire_on_node* call.
    starpu_data_acquire_on_node* call.
 */
 */
 void starpu_data_release_on_node(starpu_data_handle_t handle, int node);
 void starpu_data_release_on_node(starpu_data_handle_t handle, int node);
 
 
+/**
+   Partly release the piece of data acquired by the application either by
+   starpu_data_acquire() or by starpu_data_acquire_cb(), switching the
+   acquisition down to \p down_to_mode. For now, only releasing from STARPU_RW
+   or STARPU_W acquisition down to STARPU_R is supported, or down to the same
+   acquisition.  STARPU_NONE can also be passed as \p down_to_mode, in which
+   case this is equivalent to calling starpu_data_release().
+*/
+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
+
+/**
+   Similar to starpu_data_release_to(), except that the data
+   was made available on the given memory \p node instead of main memory.
+   The \p node parameter must be exactly the same as the corresponding \c
+   starpu_data_acquire_on_node* call.
+*/
+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode, int node);
+
 /** @} */
 /** @} */
 
 
 /**
 /**

+ 10 - 16
include/starpu_fxt.h

@@ -69,42 +69,36 @@ struct starpu_fxt_options
 	char *number_events_path;
 	char *number_events_path;
 	char *anim_path;
 	char *anim_path;
 	char *states_path;
 	char *states_path;
+	char worker_names[STARPU_NMAXWORKERS][256];
+	int nworkers;
+	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
 
 
 	/**
 	/**
 	   In case we are going to gather multiple traces (e.g in the case of
 	   In case we are going to gather multiple traces (e.g in the case of
 	   MPI processes), we may need to prefix the name of the containers.
 	   MPI processes), we may need to prefix the name of the containers.
 	*/
 	*/
 	char *file_prefix;
 	char *file_prefix;
+
 	/**
 	/**
 	   In case we are going to gather multiple traces (e.g in the case of
 	   In case we are going to gather multiple traces (e.g in the case of
-	   MPI processes), we may need to prefix the name of the containers.
+	   MPI processes), this variable stores the time offset with the rank 0.
 	*/
 	*/
 	uint64_t file_offset;
 	uint64_t file_offset;
+
 	/**
 	/**
 	   In case we are going to gather multiple traces (e.g in the case of
 	   In case we are going to gather multiple traces (e.g in the case of
-	   MPI processes), we may need to prefix the name of the containers.
+	   MPI processes), this variable stores the MPI rank of the trace file.
 	*/
 	*/
 	int file_rank;
 	int file_rank;
 
 
 	/**
 	/**
-	   Output parameters
-	*/
-	char worker_names[STARPU_NMAXWORKERS][256];
-	/**
-	   Output parameters
-	*/
-	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
-	/**
-	   Output parameters
-	*/
-	int nworkers;
-
-	/**
 	   In case we want to dump the list of codelets to an external tool
 	   In case we want to dump the list of codelets to an external tool
 	*/
 	*/
 	struct starpu_fxt_codelet_event **dumped_codelets;
 	struct starpu_fxt_codelet_event **dumped_codelets;
+
 	/**
 	/**
-	   In case we want to dump the list of codelets to an external tool
+	   In case we want to dump the list of codelets to an external tool, number
+	   of dumped codelets.
 	*/
 	*/
 	long dumped_codelets_count;
 	long dumped_codelets_count;
 };
 };

+ 7 - 2
min-dgels/Makefile.in

@@ -7,13 +7,17 @@ ADDITIONAL=additional
 
 
 all:
 all:
 	mkdir -p build
 	mkdir -p build
-	[ -d "$(CLAPACK)" ] || cp -a $(srcdir)/$(CLAPACK) .
+	[ -d "$(CLAPACK)" ] || ( cp -a $(srcdir)/$(CLAPACK) . ; chmod -R +rwX $(CLAPACK) )
 	cd $(CLAPACK) && $(MAKE) blaslib CC="$(CC)" LD="$(LD)"
 	cd $(CLAPACK) && $(MAKE) blaslib CC="$(CC)" LD="$(LD)"
 	cd $(CLAPACK) && $(MAKE) f2clib CC="$(CC)" LD="$(LD)"
 	cd $(CLAPACK) && $(MAKE) f2clib CC="$(CC)" LD="$(LD)"
-	[ -d "$(ADDITIONAL)" ] || cp -a $(srcdir)/$(ADDITIONAL) .
+	[ -d "$(ADDITIONAL)" ] || ( cp -a $(srcdir)/$(ADDITIONAL) . ; chmod -R +rwX $(ADDITIONAL) )
 	cd $(ADDITIONAL) && $(CC) -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
 	cd $(ADDITIONAL) && $(CC) -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
 
 
 install:
 install:
+installcheck:
+uninstall:
+distuninstallcheck:
+dvi:
 
 
 clean:
 clean:
 	-cd $(CLAPACK) && $(MAKE) clean && rm -rf *~
 	-cd $(CLAPACK) && $(MAKE) clean && rm -rf *~
@@ -21,6 +25,7 @@ clean:
 	rm -rf build *~
 	rm -rf build *~
 
 
 distclean: clean
 distclean: clean
+	[ -f Makefile.in ] || rm -fr $(CLAPACK) $(ADDITIONAL)
 
 
 # This part is needed by StarPU
 # This part is needed by StarPU
 
 

+ 0 - 152
min-dgels/additional/blaswrap.h

@@ -5,156 +5,4 @@
 #ifndef __BLASWRAP_H
 #ifndef __BLASWRAP_H
 #define __BLASWRAP_H
 #define __BLASWRAP_H
 
 
-#ifndef NO_BLAS_WRAP
- 
-/* BLAS1 routines */
-#define _starpu_srotg_ f2c_srotg
-#define _starpu_crotg_ f2c_crotg
-#define _starpu_drotg_ f2c_drotg
-#define _starpu_zrotg_ f2c_zrotg
-#define _starpu_srotmg_ f2c_srotmg
-#define _starpu_drotmg_ f2c_drotmg
-#define _starpu_srot_ f2c_srot
-#define _starpu_drot_ f2c_drot
-#define _starpu_srotm_ f2c_srotm
-#define _starpu_drotm_ f2c_drotm
-#define _starpu_sswap_ f2c_sswap
-#define _starpu_dswap_ f2c_dswap
-#define _starpu_cswap_ f2c_cswap
-#define _starpu_zswap_ f2c_zswap
-#define _starpu_sscal_ f2c_sscal
-#define _starpu_dscal_ f2c_dscal
-#define _starpu_cscal_ f2c_cscal
-#define _starpu_zscal_ f2c_zscal
-#define _starpu_csscal_ f2c_csscal
-#define _starpu_zdscal_ f2c_zdscal
-#define _starpu_scopy_ f2c_scopy
-#define _starpu_dcopy_ f2c_dcopy
-#define _starpu_ccopy_ f2c_ccopy
-#define _starpu_zcopy_ f2c_zcopy
-#define _starpu_saxpy_ f2c_saxpy
-#define _starpu_daxpy_ f2c_daxpy
-#define _starpu_caxpy_ f2c_caxpy
-#define _starpu_zaxpy_ f2c_zaxpy
-#define _starpu_sdot_ f2c_sdot
-#define _starpu_ddot_ f2c_ddot
-#define _starpu_cdotu_ f2c_cdotu
-#define _starpu_zdotu_ f2c_zdotu
-#define _starpu_cdotc_ f2c_cdotc
-#define _starpu_zdotc_ f2c_zdotc
-#define _starpu_snrm2_ f2c_snrm2
-#define _starpu_dnrm2_ f2c_dnrm2
-#define _starpu_scnrm2_ f2c_scnrm2
-#define _starpu_dznrm2_ f2c_dznrm2
-#define _starpu_sasum_ f2c_sasum
-#define _starpu_dasum_ f2c_dasum
-#define _starpu_scasum_ f2c_scasum
-#define _starpu_dzasum_ f2c_dzasum
-#define _starpu_isamax_ f2c_isamax
-#define _starpu_idamax_ f2c_idamax
-#define _starpu_icamax_ f2c_icamax
-#define _starpu_izamax_ f2c_izamax
- 
-/* BLAS2 routines */
-#define _starpu_sgemv_ f2c_sgemv
-#define _starpu_dgemv_ f2c_dgemv
-#define _starpu_cgemv_ f2c_cgemv
-#define _starpu_zgemv_ f2c_zgemv
-#define _starpu_sgbmv_ f2c_sgbmv
-#define _starpu_dgbmv_ f2c_dgbmv
-#define _starpu_cgbmv_ f2c_cgbmv
-#define _starpu_zgbmv_ f2c_zgbmv
-#define _starpu_chemv_ f2c_chemv
-#define _starpu_zhemv_ f2c_zhemv
-#define _starpu_chbmv_ f2c_chbmv
-#define _starpu_zhbmv_ f2c_zhbmv
-#define _starpu_chpmv_ f2c_chpmv
-#define _starpu_zhpmv_ f2c_zhpmv
-#define _starpu_ssymv_ f2c_ssymv
-#define _starpu_dsymv_ f2c_dsymv
-#define _starpu_ssbmv_ f2c_ssbmv
-#define _starpu_dsbmv_ f2c_dsbmv
-#define _starpu_sspmv_ f2c_sspmv
-#define _starpu_dspmv_ f2c_dspmv
-#define _starpu_strmv_ f2c_strmv
-#define _starpu_dtrmv_ f2c_dtrmv
-#define _starpu_ctrmv_ f2c_ctrmv
-#define _starpu_ztrmv_ f2c_ztrmv
-#define _starpu_stbmv_ f2c_stbmv
-#define _starpu_dtbmv_ f2c_dtbmv
-#define _starpu_ctbmv_ f2c_ctbmv
-#define _starpu_ztbmv_ f2c_ztbmv
-#define _starpu_stpmv_ f2c_stpmv
-#define _starpu_dtpmv_ f2c_dtpmv
-#define _starpu_ctpmv_ f2c_ctpmv
-#define _starpu_ztpmv_ f2c_ztpmv
-#define _starpu_strsv_ f2c_strsv
-#define _starpu_dtrsv_ f2c_dtrsv
-#define _starpu_ctrsv_ f2c_ctrsv
-#define _starpu_ztrsv_ f2c_ztrsv
-#define _starpu_stbsv_ f2c_stbsv
-#define _starpu_dtbsv_ f2c_dtbsv
-#define _starpu_ctbsv_ f2c_ctbsv
-#define _starpu_ztbsv_ f2c_ztbsv
-#define _starpu_stpsv_ f2c_stpsv
-#define _starpu_dtpsv_ f2c_dtpsv
-#define _starpu_ctpsv_ f2c_ctpsv
-#define _starpu_ztpsv_ f2c_ztpsv
-#define _starpu_sger_ f2c_sger
-#define _starpu_dger_ f2c_dger
-#define _starpu_cgeru_ f2c_cgeru
-#define _starpu_zgeru_ f2c_zgeru
-#define _starpu_cgerc_ f2c_cgerc
-#define _starpu_zgerc_ f2c_zgerc
-#define _starpu_cher_ f2c_cher
-#define _starpu_zher_ f2c_zher
-#define _starpu_chpr_ f2c_chpr
-#define _starpu_zhpr_ f2c_zhpr
-#define _starpu_cher2_ f2c_cher2
-#define _starpu_zher2_ f2c_zher2
-#define _starpu_chpr2_ f2c_chpr2
-#define _starpu_zhpr2_ f2c_zhpr2
-#define _starpu_ssyr_ f2c_ssyr
-#define _starpu_dsyr_ f2c_dsyr
-#define _starpu_sspr_ f2c_sspr
-#define _starpu_dspr_ f2c_dspr
-#define _starpu_ssyr2_ f2c_ssyr2
-#define _starpu_dsyr2_ f2c_dsyr2
-#define _starpu_sspr2_ f2c_sspr2
-#define _starpu_dspr2_ f2c_dspr2
- 
-/* BLAS3 routines */
-#define _starpu_sgemm_ f2c_sgemm
-#define _starpu_dgemm_ f2c_dgemm
-#define _starpu_cgemm_ f2c_cgemm
-#define _starpu_zgemm_ f2c_zgemm
-#define _starpu_ssymm_ f2c_ssymm
-#define _starpu_dsymm_ f2c_dsymm
-#define _starpu_csymm_ f2c_csymm
-#define _starpu_zsymm_ f2c_zsymm
-#define _starpu_chemm_ f2c_chemm
-#define _starpu_zhemm_ f2c_zhemm
-#define _starpu_ssyrk_ f2c_ssyrk
-#define _starpu_dsyrk_ f2c_dsyrk
-#define _starpu_csyrk_ f2c_csyrk
-#define _starpu_zsyrk_ f2c_zsyrk
-#define _starpu_cherk_ f2c_cherk
-#define _starpu_zherk_ f2c_zherk
-#define _starpu_ssyr2k_ f2c_ssyr2k
-#define _starpu_dsyr2k_ f2c_dsyr2k
-#define _starpu_csyr2k_ f2c_csyr2k
-#define _starpu_zsyr2k_ f2c_zsyr2k
-#define _starpu_cher2k_ f2c_cher2k
-#define _starpu_zher2k_ f2c_zher2k
-#define _starpu_strmm_ f2c_strmm
-#define _starpu_dtrmm_ f2c_dtrmm
-#define _starpu_ctrmm_ f2c_ctrmm
-#define _starpu_ztrmm_ f2c_ztrmm
-#define _starpu_strsm_ f2c_strsm
-#define _starpu_dtrsm_ f2c_dtrsm
-#define _starpu_ctrsm_ f2c_ctrsm
-#define _starpu_ztrsm_ f2c_ztrsm
-
-#endif /* NO_BLAS_WRAP */
-
 #endif /* __BLASWRAP_H */
 #endif /* __BLASWRAP_H */

+ 8 - 0
min-dgels/base/F2CLIBS/libf2c/Makefile

@@ -175,6 +175,14 @@ xwsne.o:	fio.h
 xwsne.o:	lio.h
 xwsne.o:	lio.h
 xwsne.o:	fmt.h
 xwsne.o:	fmt.h
 
 
+main.o:		signal1.h
+signal_.o:	signal1.h
+s_paus.o:	signal1.h
+
+err.o:		sysdep1.h
+fio.h:		sysdep1.h
+util.c:		sysdep1.h
+
 arith.h: arithchk.c
 arith.h: arithchk.c
 	$(CC) $(CFLAGS) -DNO_FPINIT arithchk.c -lm ||\
 	$(CC) $(CFLAGS) -DNO_FPINIT arithchk.c -lm ||\
 	 $(CC) -DNO_LONG_LONG $(CFLAGS) -DNO_FPINIT arithchk.c -lm
 	 $(CC) -DNO_LONG_LONG $(CFLAGS) -DNO_FPINIT arithchk.c -lm

+ 0 - 152
min-dgels/base/INCLUDE/blaswrap.h

@@ -5,156 +5,4 @@
 #ifndef __BLASWRAP_H
 #ifndef __BLASWRAP_H
 #define __BLASWRAP_H
 #define __BLASWRAP_H
 
 
-#ifndef NO_BLAS_WRAP
- 
-/* BLAS1 routines */
-#define _starpu_srotg_ f2c_srotg
-#define _starpu_crotg_ f2c_crotg
-#define _starpu_drotg_ f2c_drotg
-#define _starpu_zrotg_ f2c_zrotg
-#define _starpu_srotmg_ f2c_srotmg
-#define _starpu_drotmg_ f2c_drotmg
-#define _starpu_srot_ f2c_srot
-#define _starpu_drot_ f2c_drot
-#define _starpu_srotm_ f2c_srotm
-#define _starpu_drotm_ f2c_drotm
-#define _starpu_sswap_ f2c_sswap
-#define _starpu_dswap_ f2c_dswap
-#define _starpu_cswap_ f2c_cswap
-#define _starpu_zswap_ f2c_zswap
-#define _starpu_sscal_ f2c_sscal
-#define _starpu_dscal_ f2c_dscal
-#define _starpu_cscal_ f2c_cscal
-#define _starpu_zscal_ f2c_zscal
-#define _starpu_csscal_ f2c_csscal
-#define _starpu_zdscal_ f2c_zdscal
-#define _starpu_scopy_ f2c_scopy
-#define _starpu_dcopy_ f2c_dcopy
-#define _starpu_ccopy_ f2c_ccopy
-#define _starpu_zcopy_ f2c_zcopy
-#define _starpu_saxpy_ f2c_saxpy
-#define _starpu_daxpy_ f2c_daxpy
-#define _starpu_caxpy_ f2c_caxpy
-#define _starpu_zaxpy_ f2c_zaxpy
-#define _starpu_sdot_ f2c_sdot
-#define _starpu_ddot_ f2c_ddot
-#define _starpu_cdotu_ f2c_cdotu
-#define _starpu_zdotu_ f2c_zdotu
-#define _starpu_cdotc_ f2c_cdotc
-#define _starpu_zdotc_ f2c_zdotc
-#define _starpu_snrm2_ f2c_snrm2
-#define _starpu_dnrm2_ f2c_dnrm2
-#define _starpu_scnrm2_ f2c_scnrm2
-#define _starpu_dznrm2_ f2c_dznrm2
-#define _starpu_sasum_ f2c_sasum
-#define _starpu_dasum_ f2c_dasum
-#define _starpu_scasum_ f2c_scasum
-#define _starpu_dzasum_ f2c_dzasum
-#define _starpu_isamax_ f2c_isamax
-#define _starpu_idamax_ f2c_idamax
-#define _starpu_icamax_ f2c_icamax
-#define _starpu_izamax_ f2c_izamax
- 
-/* BLAS2 routines */
-#define _starpu_sgemv_ f2c_sgemv
-#define _starpu_dgemv_ f2c_dgemv
-#define _starpu_cgemv_ f2c_cgemv
-#define _starpu_zgemv_ f2c_zgemv
-#define _starpu_sgbmv_ f2c_sgbmv
-#define _starpu_dgbmv_ f2c_dgbmv
-#define _starpu_cgbmv_ f2c_cgbmv
-#define _starpu_zgbmv_ f2c_zgbmv
-#define _starpu_chemv_ f2c_chemv
-#define _starpu_zhemv_ f2c_zhemv
-#define _starpu_chbmv_ f2c_chbmv
-#define _starpu_zhbmv_ f2c_zhbmv
-#define _starpu_chpmv_ f2c_chpmv
-#define _starpu_zhpmv_ f2c_zhpmv
-#define _starpu_ssymv_ f2c_ssymv
-#define _starpu_dsymv_ f2c_dsymv
-#define _starpu_ssbmv_ f2c_ssbmv
-#define _starpu_dsbmv_ f2c_dsbmv
-#define _starpu_sspmv_ f2c_sspmv
-#define _starpu_dspmv_ f2c_dspmv
-#define _starpu_strmv_ f2c_strmv
-#define _starpu_dtrmv_ f2c_dtrmv
-#define _starpu_ctrmv_ f2c_ctrmv
-#define _starpu_ztrmv_ f2c_ztrmv
-#define _starpu_stbmv_ f2c_stbmv
-#define _starpu_dtbmv_ f2c_dtbmv
-#define _starpu_ctbmv_ f2c_ctbmv
-#define _starpu_ztbmv_ f2c_ztbmv
-#define _starpu_stpmv_ f2c_stpmv
-#define _starpu_dtpmv_ f2c_dtpmv
-#define _starpu_ctpmv_ f2c_ctpmv
-#define _starpu_ztpmv_ f2c_ztpmv
-#define _starpu_strsv_ f2c_strsv
-#define _starpu_dtrsv_ f2c_dtrsv
-#define _starpu_ctrsv_ f2c_ctrsv
-#define _starpu_ztrsv_ f2c_ztrsv
-#define _starpu_stbsv_ f2c_stbsv
-#define _starpu_dtbsv_ f2c_dtbsv
-#define _starpu_ctbsv_ f2c_ctbsv
-#define _starpu_ztbsv_ f2c_ztbsv
-#define _starpu_stpsv_ f2c_stpsv
-#define _starpu_dtpsv_ f2c_dtpsv
-#define _starpu_ctpsv_ f2c_ctpsv
-#define _starpu_ztpsv_ f2c_ztpsv
-#define _starpu_sger_ f2c_sger
-#define _starpu_dger_ f2c_dger
-#define _starpu_cgeru_ f2c_cgeru
-#define _starpu_zgeru_ f2c_zgeru
-#define _starpu_cgerc_ f2c_cgerc
-#define _starpu_zgerc_ f2c_zgerc
-#define _starpu_cher_ f2c_cher
-#define _starpu_zher_ f2c_zher
-#define _starpu_chpr_ f2c_chpr
-#define _starpu_zhpr_ f2c_zhpr
-#define _starpu_cher2_ f2c_cher2
-#define _starpu_zher2_ f2c_zher2
-#define _starpu_chpr2_ f2c_chpr2
-#define _starpu_zhpr2_ f2c_zhpr2
-#define _starpu_ssyr_ f2c_ssyr
-#define _starpu_dsyr_ f2c_dsyr
-#define _starpu_sspr_ f2c_sspr
-#define _starpu_dspr_ f2c_dspr
-#define _starpu_ssyr2_ f2c_ssyr2
-#define _starpu_dsyr2_ f2c_dsyr2
-#define _starpu_sspr2_ f2c_sspr2
-#define _starpu_dspr2_ f2c_dspr2
- 
-/* BLAS3 routines */
-#define _starpu_sgemm_ f2c_sgemm
-#define _starpu_dgemm_ f2c_dgemm
-#define _starpu_cgemm_ f2c_cgemm
-#define _starpu_zgemm_ f2c_zgemm
-#define _starpu_ssymm_ f2c_ssymm
-#define _starpu_dsymm_ f2c_dsymm
-#define _starpu_csymm_ f2c_csymm
-#define _starpu_zsymm_ f2c_zsymm
-#define _starpu_chemm_ f2c_chemm
-#define _starpu_zhemm_ f2c_zhemm
-#define _starpu_ssyrk_ f2c_ssyrk
-#define _starpu_dsyrk_ f2c_dsyrk
-#define _starpu_csyrk_ f2c_csyrk
-#define _starpu_zsyrk_ f2c_zsyrk
-#define _starpu_cherk_ f2c_cherk
-#define _starpu_zherk_ f2c_zherk
-#define _starpu_ssyr2k_ f2c_ssyr2k
-#define _starpu_dsyr2k_ f2c_dsyr2k
-#define _starpu_csyr2k_ f2c_csyr2k
-#define _starpu_zsyr2k_ f2c_zsyr2k
-#define _starpu_cher2k_ f2c_cher2k
-#define _starpu_zher2k_ f2c_zher2k
-#define _starpu_strmm_ f2c_strmm
-#define _starpu_dtrmm_ f2c_dtrmm
-#define _starpu_ctrmm_ f2c_ctrmm
-#define _starpu_ztrmm_ f2c_ztrmm
-#define _starpu_strsm_ f2c_strsm
-#define _starpu_dtrsm_ f2c_dtrsm
-#define _starpu_ctrsm_ f2c_ctrsm
-#define _starpu_ztrsm_ f2c_ztrsm
-
-#endif /* NO_BLAS_WRAP */
-
 #endif /* __BLASWRAP_H */
 #endif /* __BLASWRAP_H */

+ 5 - 3
mpi/examples/Makefile.am

@@ -47,10 +47,10 @@ endif
 endif
 endif
 
 
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 else
 else
-TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 endif
 
 
 if STARPU_MPI_CHECK
 if STARPU_MPI_CHECK
@@ -155,7 +155,9 @@ examplebin_PROGRAMS += 			\
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=	\
 starpu_mpi_EXAMPLES	+=	\
 	mpi_lu/plu_implicit_example_float	\
 	mpi_lu/plu_implicit_example_float	\
-	mpi_lu/plu_implicit_example_double
+	mpi_lu/plu_implicit_example_double	\
+	mpi_lu/plu_outofcore_example_float	\
+	mpi_lu/plu_outofcore_example_double
 endif
 endif
 
 
 mpi_lu_plu_example_float_LDADD =	\
 mpi_lu_plu_example_float_LDADD =	\

+ 10 - 5
mpi/examples/benchs/abstract_sendrecv_bench.c

@@ -22,25 +22,30 @@
 void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 {
 {
 	uint64_t iterations = LOOPS_DEFAULT;
 	uint64_t iterations = LOOPS_DEFAULT;
+	uint64_t s = 0;
+	uint64_t j = 0;
+	uint64_t k = 0;
 
 
 	if (mpi_rank >= 2)
 	if (mpi_rank >= 2)
 	{
 	{
+		starpu_pause();
 		if (thread_barrier != NULL)
 		if (thread_barrier != NULL)
 		{
 		{
 			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
 			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
 		}
 		}
 
 
-		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+		for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 		{
 		{
 			iterations = bench_nb_iterations(iterations, s);
 			iterations = bench_nb_iterations(iterations, s);
 
 
 			starpu_mpi_barrier(MPI_COMM_WORLD);
 			starpu_mpi_barrier(MPI_COMM_WORLD);
 
 
-			for (uint64_t j = 0; j < iterations; j++)
+			for (j = 0; j < iterations; j++)
 			{
 			{
 				starpu_mpi_barrier(MPI_COMM_WORLD);
 				starpu_mpi_barrier(MPI_COMM_WORLD);
 			}
 			}
 		}
 		}
+		starpu_resume();
 
 
 		return;
 		return;
 	}
 	}
@@ -64,7 +69,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 	}
 	}
 
 
 	global_tstart = starpu_timing_now();
 	global_tstart = starpu_timing_now();
-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 	{
 	{
 		vector_send = malloc(s);
 		vector_send = malloc(s);
 		vector_recv = malloc(s);
 		vector_recv = malloc(s);
@@ -78,7 +83,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
 
 		starpu_mpi_barrier(MPI_COMM_WORLD);
 		starpu_mpi_barrier(MPI_COMM_WORLD);
 
 
-		for (uint64_t j = 0; j < iterations; j++)
+		for (j = 0; j < iterations; j++)
 		{
 		{
 			if (mpi_rank == 0)
 			if (mpi_rank == 0)
 			{
 			{
@@ -111,7 +116,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 			const double d9_lat = lats[9 * (iterations - 1) / 10];
 			const double d9_lat = lats[9 * (iterations - 1) / 10];
 			double avg_lat = 0.0;
 			double avg_lat = 0.0;
 
 
-			for(uint64_t k = 0; k < iterations; k++)
+			for(k = 0; k < iterations; k++)
 			{
 			{
 				avg_lat += lats[k];
 				avg_lat += lats[k];
 			}
 			}

+ 17 - 12
mpi/examples/benchs/burst_helper.c

@@ -45,7 +45,8 @@ void burst_init_data(int rank)
 		recv_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
 		recv_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
 		send_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
 		send_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
 
 
-		for (int i = 0; i < burst_nb_requests; i++)
+		int i = 0;
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 		{
 			send_buffers[i] = malloc(NX_ARRAY * sizeof(float));
 			send_buffers[i] = malloc(NX_ARRAY * sizeof(float));
 			memset(send_buffers[i], 0, NX_ARRAY * sizeof(float));
 			memset(send_buffers[i], 0, NX_ARRAY * sizeof(float));
@@ -62,7 +63,8 @@ void burst_free_data(int rank)
 {
 {
 	if (rank == 0 || rank == 1)
 	if (rank == 0 || rank == 1)
 	{
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		int i = 0;
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 		{
 			starpu_data_unregister(send_handles[i]);
 			starpu_data_unregister(send_handles[i]);
 			free(send_buffers[i]);
 			free(send_buffers[i]);
@@ -84,12 +86,13 @@ void burst_free_data(int rank)
 void burst_bidir(int rank)
 void burst_bidir(int rank)
 {
 {
 	int other_rank = (rank == 0) ? 1 : 0;
 	int other_rank = (rank == 0) ? 1 : 0;
+	int i = 0;
 
 
 	FPRINTF(stderr, "Simultaneous....start (rank %d)\n", rank);
 	FPRINTF(stderr, "Simultaneous....start (rank %d)\n", rank);
 
 
 	if (rank == 0 || rank == 1)
 	if (rank == 0 || rank == 1)
 	{
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 		{
 			recv_reqs[i] = NULL;
 			recv_reqs[i] = NULL;
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
@@ -100,13 +103,13 @@ void burst_bidir(int rank)
 
 
 	if (rank == 0 || rank == 1)
 	if (rank == 0 || rank == 1)
 	{
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 		{
 			send_reqs[i] = NULL;
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
 		}
 		}
 
 
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 		{
 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
@@ -120,10 +123,11 @@ void burst_bidir(int rank)
 void burst_unidir(int sender, int receiver, int rank)
 void burst_unidir(int sender, int receiver, int rank)
 {
 {
 	FPRINTF(stderr, "%d -> %d... start (rank %d)\n", sender, receiver, rank);
 	FPRINTF(stderr, "%d -> %d... start (rank %d)\n", sender, receiver, rank);
+	int i = 0;
 
 
 	if (rank == receiver)
 	if (rank == receiver)
 	{
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 		{
 			recv_reqs[i] = NULL;
 			recv_reqs[i] = NULL;
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], sender, i, MPI_COMM_WORLD);
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], sender, i, MPI_COMM_WORLD);
@@ -134,7 +138,7 @@ void burst_unidir(int sender, int receiver, int rank)
 
 
 	if (rank == sender)
 	if (rank == sender)
 	{
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 		{
 			send_reqs[i] = NULL;
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], receiver, i, i, MPI_COMM_WORLD);
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], receiver, i, i, MPI_COMM_WORLD);
@@ -143,7 +147,7 @@ void burst_unidir(int sender, int receiver, int rank)
 
 
 	if (rank == sender || rank == receiver)
 	if (rank == sender || rank == receiver)
 	{
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 		{
 			if (rank != sender && recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (rank != sender && recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (rank == sender && send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
 			if (rank == sender && send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
@@ -160,12 +164,13 @@ void burst_bidir_half_postponed(int rank)
 {
 {
 	int other_rank = (rank == 0) ? 1 : 0;
 	int other_rank = (rank == 0) ? 1 : 0;
 	int received = 0;
 	int received = 0;
+	int i = 0;
 
 
 	FPRINTF(stderr, "Half/half burst...start (rank %d)\n", rank);
 	FPRINTF(stderr, "Half/half burst...start (rank %d)\n", rank);
 
 
 	if (rank == 0 || rank == 1)
 	if (rank == 0 || rank == 1)
 	{
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 		{
 			recv_reqs[i] = NULL;
 			recv_reqs[i] = NULL;
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
@@ -176,7 +181,7 @@ void burst_bidir_half_postponed(int rank)
 
 
 	if (rank == 0 || rank == 1)
 	if (rank == 0 || rank == 1)
 	{
 	{
-		for (int i = 0; i < (burst_nb_requests / 2); i++)
+		for (i = 0; i < (burst_nb_requests / 2); i++)
 		{
 		{
 			send_reqs[i] = NULL;
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
@@ -184,13 +189,13 @@ void burst_bidir_half_postponed(int rank)
 
 
 		if (recv_reqs[burst_nb_requests / 4]) starpu_mpi_wait(&recv_reqs[burst_nb_requests / 4], MPI_STATUS_IGNORE);
 		if (recv_reqs[burst_nb_requests / 4]) starpu_mpi_wait(&recv_reqs[burst_nb_requests / 4], MPI_STATUS_IGNORE);
 
 
-		for (int i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
+		for (i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
 		{
 		{
 			send_reqs[i] = NULL;
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
 		}
 		}
 
 
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 		{
 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);

+ 10 - 5
mpi/examples/benchs/gemm_helper.c

@@ -98,8 +98,9 @@ static void cpu_init_matrix_random(void *descr[], void *arg)
 	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
 	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned i = 0;
 
 
-	for (unsigned i = 0; i < nx *ny; i++)
+	for (i = 0; i < nx *ny; i++)
 	{
 	{
 		subA[i] = (TYPE) (starpu_drand48());
 		subA[i] = (TYPE) (starpu_drand48());
 		subB[i] = (TYPE) (starpu_drand48());
 		subB[i] = (TYPE) (starpu_drand48());
@@ -113,8 +114,9 @@ static void cpu_init_matrix_zero(void *descr[], void *arg)
 	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
 	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned i = 0;
 
 
-	for (unsigned i = 0; i < nx *ny; i++)
+	for (i = 0; i < nx *ny; i++)
 	{
 	{
 		subA[i] = (TYPE) (0);
 		subA[i] = (TYPE) (0);
 	}
 	}
@@ -290,18 +292,21 @@ void gemm_add_polling_dependencies()
 {
 {
 	starpu_tag_t nb_tasks = (starpu_tag_t) nslices * (starpu_tag_t) nslices;
 	starpu_tag_t nb_tasks = (starpu_tag_t) nslices * (starpu_tag_t) nslices;
 	unsigned nb_workers = starpu_worker_get_count();
 	unsigned nb_workers = starpu_worker_get_count();
+	starpu_tag_t synchro_tag = 0;
+	starpu_tag_t previous_tag = 0;
+	starpu_tag_t next_tag = 0;
 
 
-	for (starpu_tag_t synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
+	for (synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
 	{
 	{
 		// this synchro tag depends on tasks of previous column of tasks:
 		// this synchro tag depends on tasks of previous column of tasks:
-		for (starpu_tag_t previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
+		for (previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
 		{
 		{
 			starpu_tag_declare_deps(synchro_tag, 1, previous_tag);
 			starpu_tag_declare_deps(synchro_tag, 1, previous_tag);
 		}
 		}
 
 
 		// tasks of the next column of tasks depend on this synchro tag:
 		// tasks of the next column of tasks depend on this synchro tag:
 		// this actually allows workers to poll for new tasks, while no task is available
 		// this actually allows workers to poll for new tasks, while no task is available
-		for (starpu_tag_t next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
+		for (next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
 		{
 		{
 			starpu_tag_declare_deps(next_tag, 1, synchro_tag);
 			starpu_tag_declare_deps(next_tag, 1, synchro_tag);
 		}
 		}

+ 2 - 1
mpi/examples/benchs/sendrecv_bench.c

@@ -27,9 +27,10 @@ int main(int argc, char **argv)
 {
 {
 	int ret, rank, worldsize;
 	int ret, rank, worldsize;
 	int pause_workers = 0;
 	int pause_workers = 0;
+	int i = 0;
 
 
 
 
-	for (int i = 1; i < argc; i++)
+	for (i = 1; i < argc; i++)
 	{
 	{
 		if (strcmp(argv[i], "-p") == 0)
 		if (strcmp(argv[i], "-p") == 0)
 		{
 		{

+ 10 - 6
mpi/examples/benchs/sendrecv_parallel_tasks_bench.c

@@ -56,6 +56,8 @@ void cpu_task(void* descr[], void* args)
 	double t1, t2;
 	double t1, t2;
 	int asked_worker;
 	int asked_worker;
 	int current_worker = starpu_worker_get_id();
 	int current_worker = starpu_worker_get_id();
+	uint64_t j = 0;
+	uint64_t k = 0;
 
 
 	starpu_codelet_unpack_args(args, &mpi_rank, &asked_worker, &s, &handle_send, &handle_recv);
 	starpu_codelet_unpack_args(args, &mpi_rank, &asked_worker, &s, &handle_send, &handle_recv);
 
 
@@ -64,7 +66,7 @@ void cpu_task(void* descr[], void* args)
 	iterations = bench_nb_iterations(iterations, s);
 	iterations = bench_nb_iterations(iterations, s);
 	double* lats = malloc(sizeof(double) * iterations);
 	double* lats = malloc(sizeof(double) * iterations);
 
 
-	for (uint64_t j = 0; j < NB_WARMUP_PINGPONGS; j++)
+	for (j = 0; j < NB_WARMUP_PINGPONGS; j++)
 	{
 	{
 		if (mpi_rank == 0)
 		if (mpi_rank == 0)
 		{
 		{
@@ -78,7 +80,7 @@ void cpu_task(void* descr[], void* args)
 		}
 		}
 	}
 	}
 
 
-	for (uint64_t j = 0; j < iterations; j++)
+	for (j = 0; j < iterations; j++)
 	{
 	{
 		if (mpi_rank == 0)
 		if (mpi_rank == 0)
 		{
 		{
@@ -107,7 +109,7 @@ void cpu_task(void* descr[], void* args)
 		const double d9_lat = lats[9 * (iterations - 1) / 10];
 		const double d9_lat = lats[9 * (iterations - 1) / 10];
 		double avg_lat = 0.0;
 		double avg_lat = 0.0;
 
 
-		for(uint64_t k = 0; k < iterations; k++)
+		for(k = 0; k < iterations; k++)
 		{
 		{
 			avg_lat += lats[k];
 			avg_lat += lats[k];
 		}
 		}
@@ -167,6 +169,8 @@ int main(int argc, char **argv)
 	unsigned cpu_count = starpu_cpu_worker_get_count();
 	unsigned cpu_count = starpu_cpu_worker_get_count();
 	unsigned* mpi_tags = malloc(cpu_count * sizeof(unsigned));
 	unsigned* mpi_tags = malloc(cpu_count * sizeof(unsigned));
 	unsigned tag = 0;
 	unsigned tag = 0;
+	uint64_t s = 0;
+	unsigned i = 0;
 
 
 	int* workers = malloc(cpu_count * sizeof(int));
 	int* workers = malloc(cpu_count * sizeof(int));
 	float** vectors_send = malloc(cpu_count * sizeof(float*));
 	float** vectors_send = malloc(cpu_count * sizeof(float*));
@@ -174,11 +178,11 @@ int main(int argc, char **argv)
 	starpu_data_handle_t* handles_send = malloc(cpu_count * sizeof(starpu_data_handle_t));
 	starpu_data_handle_t* handles_send = malloc(cpu_count * sizeof(starpu_data_handle_t));
 	starpu_data_handle_t* handles_recv = malloc(cpu_count * sizeof(starpu_data_handle_t));
 	starpu_data_handle_t* handles_recv = malloc(cpu_count * sizeof(starpu_data_handle_t));
 
 
-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 	{
 	{
 		starpu_pause();
 		starpu_pause();
 
 
-		for (unsigned i = 0; i < cpu_count; i++)
+		for (i = 0; i < cpu_count; i++)
 		{
 		{
 			workers[i] = i;
 			workers[i] = i;
 			vectors_send[i] = malloc(s);
 			vectors_send[i] = malloc(s);
@@ -201,7 +205,7 @@ int main(int argc, char **argv)
 		starpu_resume();
 		starpu_resume();
 		starpu_task_wait_for_all();
 		starpu_task_wait_for_all();
 
 
-		for (unsigned i = 0; i < cpu_count; i++)
+		for (i = 0; i < cpu_count; i++)
 		{
 		{
 			starpu_data_unregister(handles_send[i]);
 			starpu_data_unregister(handles_send[i]);
 			starpu_data_unregister(handles_recv[i]);
 			starpu_data_unregister(handles_recv[i]);

+ 1 - 1
mpi/examples/filters/filter.c

@@ -59,7 +59,7 @@ void vector_filter(void *father_interface, void *child_interface, struct starpu_
 
 
 	STARPU_ASSERT_MSG(nchunks <= nx, "%u parts for %u elements", nchunks, nx);
 	STARPU_ASSERT_MSG(nchunks <= nx, "%u parts for %u elements", nchunks, nx);
 	STARPU_ASSERT(nchunks == 2);
 	STARPU_ASSERT(nchunks == 2);
-	STARPU_ASSERT_MSG((nx % nchunks) == 0, "nx=%d is not a multiple of nchunks %d\n", nx, nchunks);
+	STARPU_ASSERT_MSG((nx % nchunks) == 0, "nx=%u is not a multiple of nchunks %u\n", nx, nchunks);
 
 
 	vector_child->id = vector_father->id;
 	vector_child->id = vector_father->id;
 	vector_child->nx = nx/2;
 	vector_child->nx = nx/2;

+ 149 - 41
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -20,6 +20,64 @@
 #include <limits.h>
 #include <limits.h>
 #include <math.h>
 #include <math.h>
 
 
+/* This is from magma
+
+  -- Innovative Computing Laboratory
+  -- Electrical Engineering and Computer Science Department
+  -- University of Tennessee
+  -- (C) Copyright 2009
+
+  Redistribution  and  use  in  source and binary forms, with or without
+  modification,  are  permitted  provided  that the following conditions
+  are met:
+
+  * Redistributions  of  source  code  must  retain  the above copyright
+    notice,  this  list  of  conditions  and  the  following  disclaimer.
+  * Redistributions  in  binary  form must reproduce the above copyright
+    notice,  this list of conditions and the following disclaimer in the
+    documentation  and/or other materials provided with the distribution.
+  * Neither  the  name of the University of Tennessee, Knoxville nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  */
+
+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
+
+#define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
+
+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
+
+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
+
+#define FMULS_TRSM FMULS_TRMM
+#define FADDS_TRSM FMULS_TRMM
+
+#define FLOPS_STRSM(__m, __n) (     FMULS_TRSM((__m), (__n)) +       FADDS_TRSM((__m), (__n)) )
+
+
+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
+
+#define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
+
+/* End of magma code */
+
 /*
 /*
  *	Create the codelets
  *	Create the codelets
  */
  */
@@ -72,6 +130,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 {
 {
 	unsigned k, m, n;
 	unsigned k, m, n;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+	unsigned nn = size/nblocks;
 
 
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
@@ -80,6 +139,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 				       STARPU_RW, data_handles[k][k],
 				       STARPU_RW, data_handles[k][k],
+				       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 				       0);
 				       0);
 
 
 		for (m = k+1; m<nblocks; m++)
 		for (m = k+1; m<nblocks; m++)
@@ -88,28 +148,30 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 					       STARPU_R, data_handles[k][k],
 					       STARPU_R, data_handles[k][k],
 					       STARPU_RW, data_handles[m][k],
 					       STARPU_RW, data_handles[m][k],
+					       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 					       0);
 					       0);
 
 
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
 			if (my_distrib(k, k, nodes) == rank)
 			if (my_distrib(k, k, nodes) == rank)
 				starpu_data_wont_use(data_handles[k][k]);
 				starpu_data_wont_use(data_handles[k][k]);
+		}
 
 
-			for (n = k+1; n<nblocks; n++)
+		for (n = k+1; n<nblocks; n++)
+		{
+			for (m = n; m<nblocks; m++)
 			{
 			{
-				if (n <= m)
-				{
-					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-							       STARPU_R, data_handles[n][k],
-							       STARPU_R, data_handles[m][k],
-							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
-							       0);
-				}
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
+						       0);
 			}
 			}
 
 
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
-			if (my_distrib(m, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[m][k]);
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+			if (my_distrib(n, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[n][k]);
 		}
 		}
 		starpu_iteration_pop();
 		starpu_iteration_pop();
 	}
 	}
@@ -120,6 +182,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 {
 {
 	unsigned k, m, n;
 	unsigned k, m, n;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+	unsigned nn = size/nblocks;
 
 
 	/* Column */
 	/* Column */
 	for (n = 0; n<nblocks; n++)
 	for (n = 0; n<nblocks; n++)
@@ -137,7 +200,15 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 						       STARPU_R, data_handles[n][k],
 						       STARPU_R, data_handles[n][k],
 						       STARPU_R, data_handles[m][k],
 						       STARPU_R, data_handles[m][k],
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 						       0);
 						       0);
+
+				if (m == n)
+				{
+					/* Nobody else will need it */
+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+					starpu_data_wont_use(data_handles[m][k]);
+				}
 			}
 			}
 			k = n;
 			k = n;
 			if (m > n)
 			if (m > n)
@@ -147,6 +218,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						       STARPU_R, data_handles[k][k],
 						       STARPU_R, data_handles[k][k],
 						       STARPU_RW, data_handles[m][k],
 						       STARPU_RW, data_handles[m][k],
+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						       0);
 						       0);
 			}
 			}
 			else
 			else
@@ -155,26 +227,27 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 						       STARPU_RW, data_handles[k][k],
 						       STARPU_RW, data_handles[k][k],
+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 						       0);
 						       0);
 			}
 			}
+
 		}
 		}
 
 
+		/* We won't need it any more */
+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+		starpu_data_wont_use(data_handles[n][n]);
+
 		starpu_iteration_pop();
 		starpu_iteration_pop();
 	}
 	}
-
-	/* Submit flushes, StarPU will fit them according to the progress */
-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
-	for (m = 0; m < nblocks; m++)
-		for (n = 0; n < nblocks ; n++)
-			starpu_data_wont_use(data_handles[m][n]);
 }
 }
 
 
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
 static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
 static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
 {
 {
-	unsigned a, c;
+	unsigned a;
 	unsigned k, m, n;
 	unsigned k, m, n;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+	unsigned nn = size/nblocks;
 
 
 	/* double-antidiagonal number:
 	/* double-antidiagonal number:
 	 * - a=0 contains (0,0) plus (1,0)
 	 * - a=0 contains (0,0) plus (1,0)
@@ -205,7 +278,15 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 						       STARPU_R, data_handles[n][k],
 						       STARPU_R, data_handles[n][k],
 						       STARPU_R, data_handles[m][k],
 						       STARPU_R, data_handles[m][k],
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 						       0);
 						       0);
+
+				if (m == nblocks-1)
+				{
+					/* Nobody else will need it */
+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+					starpu_data_wont_use(data_handles[n][k]);
+				}
 			}
 			}
 
 
 			/* k = n */
 			/* k = n */
@@ -216,6 +297,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						       STARPU_R, data_handles[k][k],
 						       STARPU_R, data_handles[k][k],
 						       STARPU_RW, data_handles[m][k],
 						       STARPU_RW, data_handles[m][k],
+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						       0);
 						       0);
 			}
 			}
 			else
 			else
@@ -224,8 +306,16 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 						       STARPU_RW, data_handles[k][k],
 						       STARPU_RW, data_handles[k][k],
+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 						       0);
 						       0);
 			}
 			}
+
+			if (m == nblocks - 1)
+			{
+				/* We do not need the potrf result any more */
+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+				starpu_data_wont_use(data_handles[n][n]);
+			}
 		}
 		}
 
 
 		/* column within second antidiagonal for a */
 		/* column within second antidiagonal for a */
@@ -246,7 +336,15 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 						       STARPU_R, data_handles[n][k],
 						       STARPU_R, data_handles[n][k],
 						       STARPU_R, data_handles[m][k],
 						       STARPU_R, data_handles[m][k],
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 						       0);
 						       0);
+
+				if (m == nblocks-1)
+				{
+					/* Nobody else will need it */
+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+					starpu_data_wont_use(data_handles[n][k]);
+				}
 			}
 			}
 			/* non-diagonal block, solve */
 			/* non-diagonal block, solve */
 			k = n;
 			k = n;
@@ -254,17 +352,19 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 					       STARPU_R, data_handles[k][k],
 					       STARPU_R, data_handles[k][k],
 					       STARPU_RW, data_handles[m][k],
 					       STARPU_RW, data_handles[m][k],
+					       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 					       0);
 					       0);
+
+			if (m == nblocks - 1)
+			{
+				/* We do not need the potrf result any more */
+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+				starpu_data_wont_use(data_handles[n][n]);
+			}
 		}
 		}
 
 
 		starpu_iteration_pop();
 		starpu_iteration_pop();
 	}
 	}
-
-	/* Submit flushes, StarPU will fit them according to the progress */
-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
-	for (m = 0; m < nblocks; m++)
-		for (n = 0; n < nblocks ; n++)
-			starpu_data_wont_use(data_handles[m][n]);
 }
 }
 
 
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
@@ -273,9 +373,10 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 	unsigned a;
 	unsigned a;
 	int k, m, n;
 	int k, m, n;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+	unsigned nn = size/nblocks;
 
 
 	/*
 	/*
-	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that prio ~ 2*a or 2*a+1
+	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that gemm prio ~= 2*nblocks - a
 	 * double-antidiagonal number:
 	 * double-antidiagonal number:
 	 * - a=0 contains (0,0) plus (1,0)
 	 * - a=0 contains (0,0) plus (1,0)
 	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
 	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
@@ -285,41 +386,47 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 	{
 	{
 		starpu_iteration_push(a);
 		starpu_iteration_push(a);
 
 
-		for (k = 0; k < nblocks; k++)
+		for (k = 0; k < (int) nblocks; k++)
 		{
 		{
 			n = k;
 			n = k;
 			/* Should be m = a-k-n; for potrf and trsm to respect
 			/* Should be m = a-k-n; for potrf and trsm to respect
 			   priorities, but needs to be this for dependencies */
 			   priorities, but needs to be this for dependencies */
 			m = a-2*k-n;
 			m = a-2*k-n;
 
 
-			if (m < 0 || m >= nblocks)
-				continue;
-
 			if (m == n)
 			if (m == n)
 			{
 			{
 				/* diagonal block, factorize */
 				/* diagonal block, factorize */
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 						       STARPU_RW, data_handles[k][k],
 						       STARPU_RW, data_handles[k][k],
+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 						       0);
 						       0);
 			}
 			}
-			else
+			else if (m >= n && m < (int) nblocks)
 			{
 			{
 				/* non-diagonal block, solve */
 				/* non-diagonal block, solve */
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						       STARPU_R, data_handles[k][k],
 						       STARPU_R, data_handles[k][k],
 						       STARPU_RW, data_handles[m][k],
 						       STARPU_RW, data_handles[m][k],
+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						       0);
 						       0);
 			}
 			}
 
 
+			if (m == (int) nblocks - 1)
+			{
+				/* We do not need the potrf result any more */
+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+				starpu_data_wont_use(data_handles[n][n]);
+			}
+
 			/* column within antidiagonal for a */
 			/* column within antidiagonal for a */
-			for (n = k + 1; n < nblocks; n++)
+			for (n = k + 1; n < (int) nblocks; n++)
 			{
 			{
 				/* row */
 				/* row */
 				m = a-2*k-n;
 				m = a-2*k-n;
 
 
-				if (m >= n && m < nblocks)
+				if (m >= n && m < (int) nblocks)
 				{
 				{
 					/* Update */
 					/* Update */
 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
@@ -327,7 +434,14 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 							       STARPU_R, data_handles[n][k],
 							       STARPU_R, data_handles[n][k],
 							       STARPU_R, data_handles[m][k],
 							       STARPU_R, data_handles[m][k],
 							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+							       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 							       0);
 							       0);
+					if (m == (int) nblocks - 1)
+					{
+						/* Nobody else will need it */
+						starpu_data_wont_use(data_handles[n][k]);
+						starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+					}
 				}
 				}
 			}
 			}
 
 
@@ -335,12 +449,6 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
 
 		starpu_iteration_pop();
 		starpu_iteration_pop();
 	}
 	}
-
-	/* Submit flushes, StarPU will fit them according to the progress */
-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
-	for (m = 0; m < nblocks; m++)
-		for (n = 0; n < nblocks ; n++)
-			starpu_data_wont_use(data_handles[m][n]);
 }
 }
 
 
 /*
 /*
@@ -423,7 +531,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	if (rank == 0)
 	if (rank == 0)
 	{
 	{
 		*timing = end - start;
 		*timing = end - start;
-		*flops = (1.0f*size*size*size)/3.0f;
+		*flops = FLOPS_SPOTRF(size);
 	}
 	}
 }
 }
 
 

+ 14 - 11
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -56,63 +56,66 @@ void parse_args(int argc, char **argv, int nodes)
                         size = strtol(argv[++i], &argptr, 10);
                         size = strtol(argv[++i], &argptr, 10);
                 }
                 }
 
 
-                if (strcmp(argv[i], "-dblockx") == 0)
+                else if (strcmp(argv[i], "-dblockx") == 0)
                 {
                 {
                         char *argptr;
                         char *argptr;
                         dblockx = strtol(argv[++i], &argptr, 10);
                         dblockx = strtol(argv[++i], &argptr, 10);
                 }
                 }
 
 
-                if (strcmp(argv[i], "-dblocky") == 0)
+                else if (strcmp(argv[i], "-dblocky") == 0)
                 {
                 {
                         char *argptr;
                         char *argptr;
                         dblocky = strtol(argv[++i], &argptr, 10);
                         dblocky = strtol(argv[++i], &argptr, 10);
                 }
                 }
 
 
-                if (strcmp(argv[i], "-nblocks") == 0)
+                else if (strcmp(argv[i], "-nblocks") == 0)
                 {
                 {
                         char *argptr;
                         char *argptr;
                         nblocks = strtol(argv[++i], &argptr, 10);
                         nblocks = strtol(argv[++i], &argptr, 10);
                 }
                 }
 
 
-                if (strcmp(argv[i], "-nbigblocks") == 0)
+                else if (strcmp(argv[i], "-nbigblocks") == 0)
                 {
                 {
                         char *argptr;
                         char *argptr;
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                 }
                 }
 
 
-                if (strcmp(argv[i], "-columns") == 0)
+                else if (strcmp(argv[i], "-columns") == 0)
                 {
                 {
                         submission = COLUMNS;
                         submission = COLUMNS;
                 }
                 }
 
 
-                if (strcmp(argv[i], "-antidiagonals") == 0)
+                else if (strcmp(argv[i], "-antidiagonals") == 0)
                 {
                 {
                         submission = ANTIDIAGONALS;
                         submission = ANTIDIAGONALS;
                 }
                 }
 
 
-                if (strcmp(argv[i], "-prios") == 0)
+                else if (strcmp(argv[i], "-prios") == 0)
                 {
                 {
                         submission = PRIOS;
                         submission = PRIOS;
                 }
                 }
 
 
-                if (strcmp(argv[i], "-no-prio") == 0)
+                else if (strcmp(argv[i], "-no-prio") == 0)
                 {
                 {
                         noprio = 1;
                         noprio = 1;
                 }
                 }
 
 
-                if (strcmp(argv[i], "-check") == 0)
+                else if (strcmp(argv[i], "-check") == 0)
                 {
                 {
                         check = 1;
                         check = 1;
                 }
                 }
 
 
-                if (strcmp(argv[i], "-display") == 0)
+                else if (strcmp(argv[i], "-display") == 0)
                 {
                 {
                         display = 1;
                         display = 1;
                 }
                 }
 
 
-                if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+                else
+                /* if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) */
                 {
                 {
                         printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-prios] [-no-prio] [-display] [-check]\n", argv[0]);
                         printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-prios] [-no-prio] [-display] [-check]\n", argv[0]);
+                        fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
+                        exit(0);
                 }
                 }
         }
         }
 
 

+ 23 - 3
mpi/examples/mpi_lu/plu_example.c

@@ -37,8 +37,8 @@
 static unsigned long size = 4096;
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned nblocks = 16;
 static unsigned check = 0;
 static unsigned check = 0;
-static int p = 1;
-static int q = 1;
+static int p = -1;
+static int q = -1;
 static unsigned display = 0;
 static unsigned display = 0;
 static unsigned no_prio = 0;
 static unsigned no_prio = 0;
 
 
@@ -434,6 +434,7 @@ int main(int argc, char **argv)
 	int rank;
 	int rank;
 	int world_size;
 	int world_size;
 	int ret;
 	int ret;
+	unsigned i, j;
 
 
 	/*
 	/*
 	 *	Initialization
 	 *	Initialization
@@ -462,7 +463,14 @@ int main(int argc, char **argv)
 	/* We disable sequential consistency in this example */
 	/* We disable sequential consistency in this example */
 	starpu_data_set_default_sequential_consistency_flag(0);
 	starpu_data_set_default_sequential_consistency_flag(0);
 
 
-	STARPU_ASSERT(p*q == world_size);
+	if (p == -1 && q==-1)
+	{
+		fprintf(stderr, "Setting default values for p and q\n");
+		p = (q % 2 == 0) ? 2 : 1;
+		q = world_size / p;
+
+	}
+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
 
 
 	starpu_cublas_init();
 	starpu_cublas_init();
 
 
@@ -594,6 +602,18 @@ int main(int argc, char **argv)
 	/*
 	/*
 	 * 	Termination
 	 * 	Termination
 	 */
 	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			TYPE *blockptr = dataA[j+i*nblocks];
+			if (blockptr != STARPU_POISON_PTR)
+				starpu_free(blockptr);
+		}
+	}
+	free(dataA_handles);
+	free(dataA);
 
 
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

+ 13 - 0
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -249,6 +249,7 @@ int main(int argc, char **argv)
 	int rank;
 	int rank;
 	int world_size;
 	int world_size;
 	int ret;
 	int ret;
+	unsigned i, j;
 
 
 	starpu_srand48((long int)time(NULL));
 	starpu_srand48((long int)time(NULL));
 
 
@@ -376,6 +377,18 @@ int main(int argc, char **argv)
 	/*
 	/*
 	 * 	Termination
 	 * 	Termination
 	 */
 	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			TYPE *blockptr = dataA[j+i*nblocks];
+			if (blockptr != STARPU_POISON_PTR)
+				starpu_free(blockptr);
+		}
+	}
+	free(dataA_handles);
+	free(dataA);
 
 
 	starpu_cublas_shutdown();
 	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();

+ 26 - 11
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -39,9 +39,10 @@
 
 
 static unsigned long size = 4096;
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned nblocks = 16;
+static size_t blocksize;
 static unsigned check = 0;
 static unsigned check = 0;
-static int p = 1;
-static int q = 1;
+static int p = -1;
+static int q = -1;
 static unsigned display = 0;
 static unsigned display = 0;
 static unsigned no_prio = 0;
 static unsigned no_prio = 0;
 static char *path = "./starpu-ooc-files";
 static char *path = "./starpu-ooc-files";
@@ -53,6 +54,9 @@ static unsigned numa = 0;
 static size_t allocated_memory = 0;
 static size_t allocated_memory = 0;
 
 
 static starpu_data_handle_t *dataA_handles;
 static starpu_data_handle_t *dataA_handles;
+static void **disk_objs;
+
+static int disk_node;
 
 
 int get_block_rank(unsigned i, unsigned j);
 int get_block_rank(unsigned i, unsigned j);
 
 
@@ -142,7 +146,6 @@ static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnbl
 
 
 static void create_matrix()
 static void create_matrix()
 {
 {
-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
 	TYPE *blockptr = malloc(blocksize);
 	TYPE *blockptr = malloc(blocksize);
 	int fd;
 	int fd;
 	char *filename;
 	char *filename;
@@ -195,10 +198,9 @@ static void init_matrix(int rank)
 {
 {
 	/* Allocate a grid of data handles, not all of them have to be allocated later on */
 	/* Allocate a grid of data handles, not all of them have to be allocated later on */
 	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
 	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+	disk_objs = calloc(nblocks*nblocks, sizeof(*disk_objs));
 
 
-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
-
-	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
+	disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(16*1024*1024, size*size*sizeof(TYPE)));
 	assert(disk_node >= 0);
 	assert(disk_node >= 0);
 
 
 	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
 	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
@@ -215,21 +217,21 @@ static void init_matrix(int rank)
 
 
 			if (block_rank == rank)
 			if (block_rank == rank)
 			{
 			{
-				void *disk_obj;
 				snprintf(filename, sizeof(filename), "%u,%u", i, j);
 				snprintf(filename, sizeof(filename), "%u,%u", i, j);
 				/* Register it to StarPU */
 				/* Register it to StarPU */
-				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
-				if (!disk_obj)
+				disk_objs[j+nblocks*i] = starpu_disk_open(disk_node, filename, blocksize);
+				if (!disk_objs[j+nblocks*i])
 				{
 				{
 					fprintf(stderr,"could not open %s\n", filename);
 					fprintf(stderr,"could not open %s\n", filename);
 					exit(1);
 					exit(1);
 				}
 				}
 				starpu_matrix_data_register(handleptr, disk_node,
 				starpu_matrix_data_register(handleptr, disk_node,
-					(uintptr_t) disk_obj, size/nblocks,
+					(uintptr_t) disk_objs[j+nblocks*i], size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
 					size/nblocks, size/nblocks, sizeof(TYPE));
 			}
 			}
 			else
 			else
 			{
 			{
+				disk_objs[j+nblocks*i] = NULL;
 				starpu_matrix_data_register(handleptr, -1,
 				starpu_matrix_data_register(handleptr, -1,
 					0, size/nblocks,
 					0, size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
 					size/nblocks, size/nblocks, sizeof(TYPE));
@@ -273,6 +275,8 @@ int main(int argc, char **argv)
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
+	blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
 	ret = mkdir(path, 0777);
 	ret = mkdir(path, 0777);
 	if (ret != 0 && errno != EEXIST)
 	if (ret != 0 && errno != EEXIST)
 	{
 	{
@@ -286,7 +290,14 @@ int main(int argc, char **argv)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
 
 
-	STARPU_ASSERT(p*q == world_size);
+	if (p == -1 && q==-1)
+	{
+		fprintf(stderr, "Setting default values for p and q\n");
+		p = (q % 2 == 0) ? 2 : 1;
+		q = world_size / p;
+
+	}
+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
 
 
 	starpu_cublas_init();
 	starpu_cublas_init();
 
 
@@ -401,8 +412,12 @@ int main(int argc, char **argv)
 		for (i = 0; i < nblocks; i++)
 		for (i = 0; i < nblocks; i++)
 		{
 		{
 			starpu_data_unregister(dataA_handles[j+nblocks*i]);
 			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			if (disk_objs[j+nblocks*i])
+				starpu_disk_close(disk_node, disk_objs[j+nblocks*i], blocksize);
 		}
 		}
 	}
 	}
+	free(dataA_handles);
+	free(disk_objs);
 
 
 	starpu_cublas_shutdown();
 	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();

+ 18 - 14
mpi/src/mpi/starpu_mpi_mpi.c

@@ -203,7 +203,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			else
 			else
 			{
 			{
 				STARPU_ASSERT(req->count);
 				STARPU_ASSERT(req->count);
-				_STARPU_MPI_MALLOC(req->ptr, req->count);
+				req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
 			}
 			}
 
 
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
@@ -225,12 +225,12 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
 
 
-			/* Case: a receive request for a data with the given tag and source has already been
-			 * posted by StarPU. Asynchronously requests a Read permission over the temporary handle ,
-			 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
-			 * will be called to bring the data back to the original data handle associated to the request.*/
 			if (early_data_handle)
 			if (early_data_handle)
 			{
 			{
+				/* Case: a receive request for a data with the given tag and source has already been
+				 * posted to MPI by StarPU. Asynchronously requests a Read permission over the temporary handle ,
+				 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
+				 * will be called to bring the data back to the original data handle associated to the request.*/
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
 				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
 				while (!(early_data_handle->req_ready))
 				while (!(early_data_handle->req_ready))
@@ -254,16 +254,16 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
+				starpu_data_acquire_on_node_cb(early_data_handle->handle,STARPU_MAIN_RAM,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			}
 			}
-			/* Case: no matching data has been received. Store the receive request as an early_request. */
 			else
 			else
 			{
 			{
 				struct _starpu_mpi_req *sync_req = _starpu_mpi_sync_data_find(req->node_tag.data_tag, req->node_tag.node.rank, req->node_tag.node.comm);
 				struct _starpu_mpi_req *sync_req = _starpu_mpi_sync_data_find(req->node_tag.data_tag, req->node_tag.node.rank, req->node_tag.node.comm);
 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
 				if (sync_req)
 				if (sync_req)
 				{
 				{
+					/* Case: we already received the send envelope, we can proceed with the receive */
 					req->sync = 1;
 					req->sync = 1;
 					_starpu_mpi_datatype_allocate(req->data_handle, req);
 					_starpu_mpi_datatype_allocate(req->data_handle, req);
 					if (req->registered_datatype == 1)
 					if (req->registered_datatype == 1)
@@ -275,14 +275,16 @@ void _starpu_mpi_submit_ready_request(void *arg)
 					{
 					{
 						req->count = sync_req->count;
 						req->count = sync_req->count;
 						STARPU_ASSERT(req->count);
 						STARPU_ASSERT(req->count);
-						_STARPU_MPI_MALLOC(req->ptr, req->count);
+						req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
 					}
 					}
 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_STARPU_MPI_INC_READY_REQUESTS(+1);
 					_STARPU_MPI_INC_READY_REQUESTS(+1);
+					/* Throw away the dumb request that was only used to know that we got the envelope */
 					_starpu_mpi_request_destroy(sync_req);
 					_starpu_mpi_request_destroy(sync_req);
 				}
 				}
 				else
 				else
 				{
 				{
+					/* Case: no matching data has been received. Store the receive request as an early_request. */
 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
 					_starpu_mpi_early_request_enqueue(req);
 					_starpu_mpi_early_request_enqueue(req);
 				}
 				}
@@ -684,6 +686,8 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 
 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
 
 
+	STARPU_VALGRIND_YIELD();
+
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	ret = req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, flag);
 	ret = req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, flag);
 	if (*flag)
 	if (*flag)
@@ -908,6 +912,8 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
+/* This is called when the data is now received in the early data handle, we can
+ * now copy it over to the real handle. */
 static void _starpu_mpi_early_data_cb(void* arg)
 static void _starpu_mpi_early_data_cb(void* arg)
 {
 {
 	struct _starpu_mpi_early_data_cb_args *args = arg;
 	struct _starpu_mpi_early_data_cb_args *args = arg;
@@ -954,7 +960,7 @@ static void _starpu_mpi_early_data_cb(void* arg)
 	}
 	}
 
 
 	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
 	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
-	starpu_data_release(args->early_handle);
+	starpu_data_release_on_node(args->early_handle, STARPU_MAIN_RAM);
 
 
 	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
 	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
 	/* XXX: note that we have already freed the registered buffer above. In
 	/* XXX: note that we have already freed the registered buffer above. In
@@ -1101,8 +1107,6 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 		_starpu_mpi_req_list_push_back(&detached_requests, req);
 		_starpu_mpi_req_list_push_back(&detached_requests, req);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
 
-		starpu_wake_all_blocked_workers();
-
 		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
 		STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
@@ -1204,14 +1208,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 	}
 	}
 
 
-	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
+	if (!_starpu_mpi_nobind && starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
 	{
 	{
 		char hostname[65];
 		char hostname[65];
 		gethostname(hostname, sizeof(hostname));
 		gethostname(hostname, sizeof(hostname));
 		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 	}
 	}
 	_starpu_mpi_do_initialize(argc_argv);
 	_starpu_mpi_do_initialize(argc_argv);
-	if (_starpu_mpi_thread_cpuid >= 0)
+	if (!_starpu_mpi_nobind && _starpu_mpi_thread_cpuid >= 0)
 		/* In case MPI changed the binding */
 		/* In case MPI changed the binding */
 		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
 		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
 #else
 #else
@@ -1450,7 +1454,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 						else
 						else
 						{
 						{
 							early_request->count = envelope->size;
 							early_request->count = envelope->size;
-							_STARPU_MPI_MALLOC(early_request->ptr, early_request->count);
+							early_request->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, early_request->count, 0);
 							starpu_memory_allocate(STARPU_MAIN_RAM, early_request->count, STARPU_MEMORY_OVERFLOW);
 							starpu_memory_allocate(STARPU_MAIN_RAM, early_request->count, STARPU_MEMORY_OVERFLOW);
 
 
 							STARPU_MPI_ASSERT_MSG(early_request->ptr, "cannot allocate message of size %ld\n", early_request->count);
 							STARPU_MPI_ASSERT_MSG(early_request->ptr, "cannot allocate message of size %ld\n", early_request->count);

+ 7 - 4
mpi/src/nmad/starpu_mpi_nmad.c

@@ -245,6 +245,8 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
 
+	STARPU_VALGRIND_YIELD();
+
 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 
 
 	/* we must do a test_locked to avoid race condition :
 	/* we must do a test_locked to avoid race condition :
@@ -344,7 +346,7 @@ void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_ev
 				// req->ptr is freed by starpu_data_unpack
 				// req->ptr is freed by starpu_data_unpack
 				starpu_data_unpack(req->data_handle, req->ptr, req->count);
 				starpu_data_unpack(req->data_handle, req->ptr, req->count);
 			else
 			else
-				free(req->ptr);
+				starpu_free_on_node_flags(STARPU_MAIN_RAM, (uintptr_t) req->ptr, req->count, 0);
 		}
 		}
 		else
 		else
 		{
 		{
@@ -451,7 +453,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
+	if (!_starpu_mpi_nobind && starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
 	{
 	{
 		char hostname[65];
 		char hostname[65];
 		gethostname(hostname, sizeof(hostname));
 		gethostname(hostname, sizeof(hostname));
@@ -623,7 +625,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 	 * required for piom_ltask_set_bound_thread_indexes() */
 	 * required for piom_ltask_set_bound_thread_indexes() */
 	_starpu_mpi_do_initialize(argc_argv);
 	_starpu_mpi_do_initialize(argc_argv);
 
 
-	if (_starpu_mpi_thread_cpuid < 0)
+	if (!_starpu_mpi_nobind && _starpu_mpi_thread_cpuid < 0)
 	{
 	{
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 	}
 	}
@@ -633,7 +635,8 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 	/* Tell pioman to use a bound thread for communication progression:
 	/* Tell pioman to use a bound thread for communication progression:
 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
 	int indexes[1] = { _starpu_mpi_thread_cpuid };
 	int indexes[1] = { _starpu_mpi_thread_cpuid };
-	piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
+	if (!_starpu_mpi_nobind)
+		piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
 
 
 	/* Register some hooks for communication progress if needed */
 	/* Register some hooks for communication progress if needed */
 	int polling_point_prog, polling_point_idle;
 	int polling_point_prog, polling_point_idle;

+ 1 - 1
mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c

@@ -130,7 +130,7 @@ static void _starpu_mpi_unknown_datatype_recv_callback(nm_sr_event_t event, cons
 		int ret = nm_sr_recv_peek(req->backend->session, &(req->backend->data_request), &(req->backend->unknown_datatype_size));
 		int ret = nm_sr_recv_peek(req->backend->session, &(req->backend->data_request), &(req->backend->unknown_datatype_size));
 		STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "nm_sr_recv_peek returned %d", ret);
 		STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "nm_sr_recv_peek returned %d", ret);
 
 
-		req->ptr = malloc(req->count);
+		req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
 		STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld", req->count);
 		STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld", req->count);
 
 
 		nm_mpi_nmad_data_get(&(req->backend->unknown_datatype_body), (void*) req->ptr, req->datatype, req->count);
 		nm_mpi_nmad_data_get(&(req->backend->unknown_datatype_body), (void*) req->ptr, req->datatype, req->count);

+ 2 - 2
mpi/src/starpu_mpi_coop_sends.c

@@ -47,13 +47,13 @@ void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req)
 			/* We were last, release data */
 			/* We were last, release data */
 			free(coop_sends->reqs_array);
 			free(coop_sends->reqs_array);
 			free(coop_sends);
 			free(coop_sends);
-			starpu_data_release(req->data_handle);
+			starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
 		}
 		}
 	}
 	}
 	else
 	else
 	{
 	{
 		/* Trivial request */
 		/* Trivial request */
-		starpu_data_release(req->data_handle);
+		starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
 	}
 	}
 }
 }
 
 

+ 1 - 1
mpi/src/starpu_mpi_fxt.h

@@ -72,7 +72,7 @@ extern "C"
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, data_tag, size)	\
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, data_tag, size)	\
 	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
 	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, data_tag, size, jobid, handle)	\
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, data_tag, size, jobid, handle)	\
-	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), _starpu_gettid(), (handle));
+	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), (handle), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, data_tag)	\
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, data_tag)	\
 	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (data_tag), _starpu_gettid());
 	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, data_tag)	\
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, data_tag)	\

+ 2 - 0
mpi/src/starpu_mpi_private.c

@@ -22,6 +22,7 @@ int _starpu_debug_level_max=0;
 int _starpu_mpi_tag = 42;
 int _starpu_mpi_tag = 42;
 int _starpu_mpi_comm_debug;
 int _starpu_mpi_comm_debug;
 
 
+int _starpu_mpi_nobind = -1;
 int _starpu_mpi_thread_cpuid = -1;
 int _starpu_mpi_thread_cpuid = -1;
 int _starpu_mpi_use_prio = 1;
 int _starpu_mpi_use_prio = 1;
 int _starpu_mpi_fake_world_size = -1;
 int _starpu_mpi_fake_world_size = -1;
@@ -62,6 +63,7 @@ void _starpu_mpi_env_init(void)
         _starpu_mpi_comm_debug = starpu_getenv("STARPU_MPI_COMM") != NULL;
         _starpu_mpi_comm_debug = starpu_getenv("STARPU_MPI_COMM") != NULL;
 	_starpu_mpi_fake_world_size = starpu_get_env_number("STARPU_MPI_FAKE_SIZE");
 	_starpu_mpi_fake_world_size = starpu_get_env_number("STARPU_MPI_FAKE_SIZE");
 	_starpu_mpi_fake_world_rank = starpu_get_env_number("STARPU_MPI_FAKE_RANK");
 	_starpu_mpi_fake_world_rank = starpu_get_env_number("STARPU_MPI_FAKE_RANK");
+	_starpu_mpi_nobind = starpu_get_env_number_default("STARPU_MPI_NOBIND", 0);
 	_starpu_mpi_thread_cpuid = starpu_get_env_number_default("STARPU_MPI_THREAD_CPUID", -1);
 	_starpu_mpi_thread_cpuid = starpu_get_env_number_default("STARPU_MPI_THREAD_CPUID", -1);
 	_starpu_mpi_use_prio = starpu_get_env_number_default("STARPU_MPI_PRIORITIES", 1);
 	_starpu_mpi_use_prio = starpu_get_env_number_default("STARPU_MPI_PRIORITIES", 1);
 	_starpu_mpi_use_coop_sends = starpu_get_env_number_default("STARPU_MPI_COOP_SENDS", 1);
 	_starpu_mpi_use_coop_sends = starpu_get_env_number_default("STARPU_MPI_COOP_SENDS", 1);

+ 2 - 1
mpi/src/starpu_mpi_private.h

@@ -61,6 +61,7 @@ void _starpu_mpi_set_debug_level_max(int level);
 extern int _starpu_mpi_fake_world_size;
 extern int _starpu_mpi_fake_world_size;
 extern int _starpu_mpi_fake_world_rank;
 extern int _starpu_mpi_fake_world_rank;
 extern int _starpu_mpi_use_prio;
 extern int _starpu_mpi_use_prio;
+extern int _starpu_mpi_nobind;
 extern int _starpu_mpi_thread_cpuid;
 extern int _starpu_mpi_thread_cpuid;
 extern int _starpu_mpi_use_coop_sends;
 extern int _starpu_mpi_use_coop_sends;
 extern int _starpu_mpi_mem_throttle;
 extern int _starpu_mpi_mem_throttle;
@@ -200,7 +201,7 @@ struct _starpu_mpi_data
 {
 {
 	int magic;
 	int magic;
 	struct _starpu_mpi_node_tag node_tag;
 	struct _starpu_mpi_node_tag node_tag;
-	int *cache_sent;
+	char *cache_sent;
 	int cache_received;
 	int cache_received;
 
 
 	/** Rendez-vous data for opportunistic cooperative sends */
 	/** Rendez-vous data for opportunistic cooperative sends */

+ 2 - 2
mpi/tests/Makefile.am

@@ -45,10 +45,10 @@ endif
 endif
 endif
 
 
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 else
 else
-TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 endif
 
 
 if STARPU_MPI_CHECK
 if STARPU_MPI_CHECK

+ 1 - 1
mpi/tests/broadcast.c

@@ -40,7 +40,7 @@ int main(int argc, char **argv)
 {
 {
 	int ret, rank, size;
 	int ret, rank, size;
 	starpu_data_handle_t handle;
 	starpu_data_handle_t handle;
-	int var;
+	int var=-1;
 	int mpi_init;
 	int mpi_init;
 	MPI_Status status;
 	MPI_Status status;
 
 

+ 18 - 2
mpi/tests/early_request.c

@@ -109,6 +109,18 @@ void submitted_order_fun(void *buffers[], void *cl_arg)
 	(void)cl_arg;
 	(void)cl_arg;
 }
 }
 
 
+static struct starpu_codelet submitted_order_rw =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {submitted_order_fun, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+#ifdef STARPU_SIMGRID
+	.model = &starpu_perfmodel_nop,
+#endif
+	.name = "submitted_order_enforcer"
+};
+
 static struct starpu_codelet submitted_order =
 static struct starpu_codelet submitted_order =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
@@ -156,11 +168,15 @@ void insert_work_for_one_element(struct element *el)
 			   STARPU_W,tmp_send,
 			   STARPU_W,tmp_send,
 			   0);
 			   0);
 	//Send operation
 	//Send operation
-	starpu_insert_task(&submitted_order,
+	starpu_insert_task(&submitted_order_rw,
 			   STARPU_RW,el->ensure_submitted_order_send,
 			   STARPU_RW,el->ensure_submitted_order_send,
-			   STARPU_W,tmp_send,
+			   STARPU_RW,tmp_send,
 			   0);
 			   0);
 	starpu_mpi_isend_detached(tmp_send,el->foreign_domain,el->tag, MPI_COMM_WORLD, NULL, NULL);
 	starpu_mpi_isend_detached(tmp_send,el->foreign_domain,el->tag, MPI_COMM_WORLD, NULL, NULL);
+	starpu_insert_task(&submitted_order_rw,
+			   STARPU_RW,el->ensure_submitted_order_send,
+			   STARPU_RW,tmp_send,
+			   0);
 
 
 	//Recv operation for current element
 	//Recv operation for current element
 	starpu_insert_task(&submitted_order,
 	starpu_insert_task(&submitted_order,

+ 1 - 1
mpi/tests/insert_task_compute.c

@@ -47,7 +47,7 @@ int test(int rank, int node, int *before, int *after, int task_insert, int data_
 	ret = starpu_mpi_init_conf(NULL, NULL, 0, MPI_COMM_WORLD, NULL);
 	ret = starpu_mpi_init_conf(NULL, NULL, 0, MPI_COMM_WORLD, NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 
 
-	if (starpu_cpu_worker_get_count() <= 0)
+	if (starpu_cpu_worker_get_count() == 0)
 	{
 	{
 		// If there is no cpu to execute the codelet, mpi will block trying to do the post-execution communication
 		// If there is no cpu to execute the codelet, mpi will block trying to do the post-execution communication
 		ret = -ENODEV;
 		ret = -ENODEV;

+ 5 - 3
mpi/tests/pingpong.c

@@ -43,13 +43,14 @@ int main(int argc, char **argv)
 {
 {
 	int ret, rank, size;
 	int ret, rank, size;
 	int mpi_init;
 	int mpi_init;
+	int i;
 
 
 	int niter = DEFAULT_NITER;
 	int niter = DEFAULT_NITER;
 	int data_size = DEFAULT_DATA_SIZE;
 	int data_size = DEFAULT_DATA_SIZE;
 	int sleep_time = DEFAULT_SLEEP_TIME;
 	int sleep_time = DEFAULT_SLEEP_TIME;
 	int method = DEFAULT_METHOD;
 	int method = DEFAULT_METHOD;
 
 
-	for (int i = 1; i < argc; i++)
+	for (i = 1; i < argc; i++)
 	{
 	{
 		if (strcmp(argv[i], "-n") == 0)
 		if (strcmp(argv[i], "-n") == 0)
 		{
 		{
@@ -134,6 +135,7 @@ int main(int argc, char **argv)
 	int loop;
 	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int sender;
 	int sender;
+	int r = 0;
 
 
 	if (method == 0) // ping pongs
 	if (method == 0) // ping pongs
 	{
 	{
@@ -161,7 +163,7 @@ int main(int argc, char **argv)
 			sender = loop % size;
 			sender = loop % size;
 			if (sender == rank)
 			if (sender == rank)
 			{
 			{
-				for (int r = 0; r < size; r++)
+				for (r = 0; r < size; r++)
 				{
 				{
 					if (r != rank)
 					if (r != rank)
 					{
 					{
@@ -175,7 +177,7 @@ int main(int argc, char **argv)
 				MPI_Status status;
 				MPI_Status status;
 				starpu_mpi_recv(tab_handle, sender, (rank * niter) + loop, MPI_COMM_WORLD, &status);
 				starpu_mpi_recv(tab_handle, sender, (rank * niter) + loop, MPI_COMM_WORLD, &status);
 
 
-				for (int r = 0; r < (size-1); r++)
+				for (r = 0; r < (size-1); r++)
 					starpu_sleep(sleep_time / 1000);
 					starpu_sleep(sleep_time / 1000);
 			}
 			}
 		}
 		}

+ 11 - 0
src/common/fxt.c

@@ -27,6 +27,7 @@ unsigned long _starpu_job_cnt = 0;
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 #include <common/fxt.h>
 #include <common/fxt.h>
 #include <starpu_fxt.h>
 #include <starpu_fxt.h>
+#include <sys/stat.h>
 
 
 #ifdef STARPU_HAVE_WINDOWS
 #ifdef STARPU_HAVE_WINDOWS
 #include <windows.h>
 #include <windows.h>
@@ -95,6 +96,16 @@ static void _starpu_profile_set_tracefile(void)
 	char *fxt_prefix = starpu_getenv("STARPU_FXT_PREFIX");
 	char *fxt_prefix = starpu_getenv("STARPU_FXT_PREFIX");
 	if (!fxt_prefix)
 	if (!fxt_prefix)
 	     fxt_prefix = "/tmp/";
 	     fxt_prefix = "/tmp/";
+	else
+	{
+		// Check if the given folder really exists:
+		struct stat folder_stat;
+		if (stat(fxt_prefix, &folder_stat) < 0 || !S_ISDIR(folder_stat.st_mode))
+		{
+			_STARPU_MSG("%s is not a valid directory.\n", fxt_prefix);
+			_starpu_abort();
+		}
+	}
 
 
 	user = starpu_getenv("USER");
 	user = starpu_getenv("USER");
 	if (!user)
 	if (!user)

+ 2 - 0
src/common/utils.h

@@ -85,8 +85,10 @@
 #define _STARPU_UYIELD() ((void)0)
 #define _STARPU_UYIELD() ((void)0)
 #endif
 #endif
 #if defined(STARPU_HAVE_SCHED_YIELD) && defined(STARPU_HAVE_HELGRIND_H)
 #if defined(STARPU_HAVE_SCHED_YIELD) && defined(STARPU_HAVE_HELGRIND_H)
+#define STARPU_VALGRIND_YIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); } while (0)
 #define STARPU_UYIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); else _STARPU_UYIELD(); } while (0)
 #define STARPU_UYIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); else _STARPU_UYIELD(); } while (0)
 #else
 #else
+#define STARPU_VALGRIND_YIELD() do { } while (0)
 #define STARPU_UYIELD() _STARPU_UYIELD()
 #define STARPU_UYIELD() _STARPU_UYIELD()
 #endif
 #endif
 
 

+ 4 - 16
src/core/dependencies/cg.c

@@ -169,7 +169,6 @@ int _starpu_list_tag_successors_in_cg_list(struct _starpu_cg_list *successors, u
 	return n;
 	return n;
 }
 }
 
 
-/* Note: in case of a tag, it must be already locked */
 void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg)
 void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg)
 {
 {
 	STARPU_ASSERT(cg);
 	STARPU_ASSERT(cg);
@@ -208,6 +207,7 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 				struct _starpu_tag *tag;
 				struct _starpu_tag *tag;
 
 
 				tag = cg->succ.tag;
 				tag = cg->succ.tag;
+				_starpu_spin_lock(&tag->lock);
 				tag_successors = &tag->tag_successors;
 				tag_successors = &tag->tag_successors;
 
 
 				tag_successors->ndeps_completed++;
 				tag_successors->ndeps_completed++;
@@ -219,8 +219,10 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 				{
 				{
 					/* reset the counter so that we can reuse the completion group */
 					/* reset the counter so that we can reuse the completion group */
 					tag_successors->ndeps_completed = 0;
 					tag_successors->ndeps_completed = 0;
+					/* This releases the lock */
 					_starpu_tag_set_ready(tag);
 					_starpu_tag_set_ready(tag);
-				}
+				} else
+					_starpu_spin_unlock(&tag->lock);
 				break;
 				break;
 			}
 			}
 
 
@@ -370,21 +372,7 @@ void _starpu_notify_cg_list(void *pred, struct _starpu_cg_list *successors)
 			successors->nsuccs--;
 			successors->nsuccs--;
 		}
 		}
 		_starpu_spin_unlock(&successors->lock);
 		_starpu_spin_unlock(&successors->lock);
-
-		struct _starpu_tag *cgtag = NULL;
-
-		if (cg_type == STARPU_CG_TAG)
-		{
-			cgtag = cg->succ.tag;
-			STARPU_ASSERT(cgtag);
-			_starpu_spin_lock(&cgtag->lock);
-		}
-
 		_starpu_notify_cg(pred, cg);
 		_starpu_notify_cg(pred, cg);
-
-		if (cg_type == STARPU_CG_TAG)
-			_starpu_spin_unlock(&cgtag->lock);
-
 		_starpu_spin_lock(&successors->lock);
 		_starpu_spin_lock(&successors->lock);
 	}
 	}
 	successors->terminated = 1;
 	successors->terminated = 1;

+ 31 - 9
src/core/dependencies/data_arbiter_concurrency.c

@@ -533,7 +533,7 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 }
 }
 void ___starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 void ___starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 #else // LOCK_OR_DELEGATE
 #else // LOCK_OR_DELEGATE
-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
 #endif
 #endif
 {
 {
 	starpu_arbiter_t arbiter = handle->arbiter;
 	starpu_arbiter_t arbiter = handle->arbiter;
@@ -546,10 +546,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 	{
 	{
 		/* No waiter, just remove our reference */
 		/* No waiter, just remove our reference */
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
-		STARPU_ASSERT(handle->refcnt > 0);
-		handle->refcnt--;
-		STARPU_ASSERT(handle->busy_count > 0);
-		handle->busy_count--;
+		if (down_to_mode == STARPU_NONE)
+		{
+			STARPU_ASSERT(handle->refcnt > 0);
+			handle->refcnt--;
+			STARPU_ASSERT(handle->busy_count > 0);
+			handle->busy_count--;
+		}
+		else
+		{
+			/* Downgrade from W or RW down to R, keeping the same reference,
+			 * but thus allowing other readers without allowing writers.  */
+			STARPU_ASSERT(down_to_mode == STARPU_R &&
+				      handle->current_mode == STARPU_W);
+			handle->current_mode = down_to_mode;
+		}
 #ifndef LOCK_OR_DELEGATE
 #ifndef LOCK_OR_DELEGATE
 		STARPU_PTHREAD_MUTEX_UNLOCK(&arbiter->mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&arbiter->mutex);
 #endif
 #endif
@@ -562,10 +573,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 
 
 	/* There is a waiter, remove our reference */
 	/* There is a waiter, remove our reference */
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
-	STARPU_ASSERT(handle->refcnt > 0);
-	handle->refcnt--;
-	STARPU_ASSERT(handle->busy_count > 0);
-	handle->busy_count--;
+	if (down_to_mode == STARPU_NONE)
+	{
+		STARPU_ASSERT(handle->refcnt > 0);
+		handle->refcnt--;
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+	}
+	else
+	{
+		/* Downgrade from W or RW down to R, keeping the same reference,
+		 * but thus allowing other readers without allowing writers.  */
+		STARPU_ASSERT(down_to_mode == STARPU_R &&
+			      handle->current_mode == STARPU_W);
+		handle->current_mode = down_to_mode;
+	}
 	/* There should be at least one busy_count reference for the waiter
 	/* There should be at least one busy_count reference for the waiter
 	 * (thus we don't risk to see the handle disappear below) */
 	 * (thus we don't risk to see the handle disappear below) */
 	STARPU_ASSERT(handle->busy_count > 0);
 	STARPU_ASSERT(handle->busy_count > 0);

+ 29 - 11
src/core/dependencies/data_concurrency.c

@@ -509,10 +509,16 @@ void _starpu_submit_job_take_data_deps(struct _starpu_job *j)
  * This may free the handle if it was lazily unregistered (1 is returned in
  * This may free the handle if it was lazily unregistered (1 is returned in
  * that case). The handle pointer thus becomes invalid for the caller.
  * that case). The handle pointer thus becomes invalid for the caller.
  */
  */
-int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
 {
 {
 	_starpu_spin_checklocked(&handle->header_lock);
 	_starpu_spin_checklocked(&handle->header_lock);
 
 
+	if (down_to_mode != STARPU_NONE && handle->current_mode == down_to_mode)
+	{
+		/* No change, nothing to do */
+		return 0;
+	}
+
 	if (handle->arbiter)
 	if (handle->arbiter)
 	{
 	{
 		/* Keep our reference for now, _starpu_notify_arbitered_dependencies
 		/* Keep our reference for now, _starpu_notify_arbitered_dependencies
@@ -521,22 +527,34 @@ int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 		STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->reduction_req_list));
 		STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->reduction_req_list));
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 		/* _starpu_notify_arbitered_dependencies will handle its own locking */
 		/* _starpu_notify_arbitered_dependencies will handle its own locking */
-		_starpu_notify_arbitered_dependencies(handle);
+		_starpu_notify_arbitered_dependencies(handle, down_to_mode);
 		/* We have already unlocked */
 		/* We have already unlocked */
 		return 1;
 		return 1;
 	}
 	}
 
 
-	/* A data access has finished so we remove a reference. */
-	STARPU_ASSERT(handle->refcnt > 0);
-	handle->refcnt--;
-	STARPU_ASSERT(handle->busy_count > 0);
-	handle->busy_count--;
-	if (_starpu_data_check_not_busy(handle))
-		/* Handle was destroyed, nothing left to do.  */
-		return 1;
-
 	STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->arbitered_req_list));
 	STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->arbitered_req_list));
 
 
+	if (down_to_mode == STARPU_NONE)
+	{
+		/* A data access has finished so we remove a reference. */
+		STARPU_ASSERT(handle->refcnt > 0);
+		handle->refcnt--;
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+		if (_starpu_data_check_not_busy(handle))
+			/* Handle was destroyed, nothing left to do.  */
+			return 1;
+	}
+	else
+	{
+		/* Downgrade from W or RW down to R, keeping the same reference,
+		 * but thus allowing other readers without allowing writers.  */
+		STARPU_ASSERT(down_to_mode == STARPU_R &&
+				(handle->current_mode == STARPU_RW ||
+				 handle->current_mode == STARPU_W));
+		handle->current_mode = down_to_mode;
+	}
+
 	/* In case there is a pending reduction, and that this is the last
 	/* In case there is a pending reduction, and that this is the last
 	 * requester, we may go back to a "normal" coherency model. */
 	 * requester, we may go back to a "normal" coherency model. */
 	if (handle->reduction_refcnt > 0)
 	if (handle->reduction_refcnt > 0)

+ 2 - 2
src/core/dependencies/data_concurrency.h

@@ -28,8 +28,8 @@ void _starpu_submit_job_enforce_arbitered_deps(struct _starpu_job *j, unsigned b
 void _starpu_submit_job_take_data_deps(struct _starpu_job *j);
 void _starpu_submit_job_take_data_deps(struct _starpu_job *j);
 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
 
 
-int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle);
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
 
 
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
 							  enum starpu_data_access_mode mode,
 							  enum starpu_data_access_mode mode,

+ 14 - 4
src/core/dependencies/implicit_data_deps.c

@@ -225,10 +225,13 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
 		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
 		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
 		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
 
 
+		if (mode & STARPU_R)
+			STARPU_ASSERT_MSG(handle->initialized || handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
+
 		if (mode & STARPU_W || mode == STARPU_REDUX)
 		if (mode & STARPU_W || mode == STARPU_REDUX)
 		{
 		{
 
 
-			STARPU_ASSERT_MSG(!handle->readonly, "Read-only handles can not be written to");
+			STARPU_ASSERT_MSG(!handle->readonly, "Read-only handle %p can not be written to", handle);
 
 
 			handle->initialized = 1;
 			handle->initialized = 1;
 			/* We will change our value, disconnect from our readonly duplicates */
 			/* We will change our value, disconnect from our readonly duplicates */
@@ -613,18 +616,25 @@ void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data
         _STARPU_LOG_OUT();
         _STARPU_LOG_OUT();
 }
 }
 
 
-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 {
 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	unsigned do_submit_tasks = 0;
 	unsigned do_submit_tasks = 0;
+	unsigned last_cnt;
 
 
 	/* Here helgrind would shout that this is an unprotected access, but
 	/* Here helgrind would shout that this is an unprotected access, but
 	 * count can only be zero if we don't have to care about
 	 * count can only be zero if we don't have to care about
 	 * post_sync_tasks_cnt at all.  */
 	 * post_sync_tasks_cnt at all.  */
-	if (STARPU_RUNNING_ON_VALGRIND || handle->post_sync_tasks_cnt)
+	if (handle->post_sync_tasks_cnt)
 	{
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
-		if (--handle->post_sync_tasks_cnt == 0)
+		last_cnt = handle->post_sync_tasks_cnt;
+
+		if (mode == STARPU_NONE)
+			/* Last release from us */
+			handle->post_sync_tasks_cnt--;
+
+		if (last_cnt == 1)
 		{
 		{
 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
 			do_submit_tasks = 1;
 			do_submit_tasks = 1;

+ 1 - 1
src/core/dependencies/implicit_data_deps.h

@@ -30,7 +30,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 
 
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);
+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 
 
 /** Register a hook to be called when a write is submitted */
 /** Register a hook to be called when a write is submitted */
 void _starpu_implicit_data_deps_write_hook(void (*func)(starpu_data_handle_t));
 void _starpu_implicit_data_deps_write_hook(void (*func)(starpu_data_handle_t));

+ 16 - 11
src/core/dependencies/tags.c

@@ -120,13 +120,23 @@ static void _starpu_tag_free(void *_tag)
 			unsigned STARPU_ATTRIBUTE_UNUSED remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);
 			unsigned STARPU_ATTRIBUTE_UNUSED remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);
 
 
 			if (!ntags && (cg->cg_type == STARPU_CG_TAG))
 			if (!ntags && (cg->cg_type == STARPU_CG_TAG))
+			{
 				/* Last tag this cg depends on, cg becomes unreferenced */
 				/* Last tag this cg depends on, cg becomes unreferenced */
+#ifdef STARPU_DEBUG
+				free(cg->deps);
+				free(cg->done);
+#endif
 				free(cg);
 				free(cg);
+			}
 		}
 		}
 
 
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 		free(tag->tag_successors.succ);
 		free(tag->tag_successors.succ);
 #endif
 #endif
+#ifdef STARPU_DEBUG
+		free(tag->tag_successors.deps);
+		free(tag->tag_successors.done);
+#endif
 
 
 		_starpu_spin_unlock(&tag->lock);
 		_starpu_spin_unlock(&tag->lock);
 		_starpu_spin_destroy(&tag->lock);
 		_starpu_spin_destroy(&tag->lock);
@@ -221,7 +231,7 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 	return tag;
 	return tag;
 }
 }
 
 
-/* lock should be taken */
+/* lock should be taken, and this releases it */
 void _starpu_tag_set_ready(struct _starpu_tag *tag)
 void _starpu_tag_set_ready(struct _starpu_tag *tag)
 {
 {
 	/* mark this tag as ready to run */
 	/* mark this tag as ready to run */
@@ -229,6 +239,10 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 	/* declare it to the scheduler ! */
 	/* declare it to the scheduler ! */
 	struct _starpu_job *j = tag->job;
 	struct _starpu_job *j = tag->job;
 
 
+	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
+	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
+	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
+
 	/* In case the task job is going to be scheduled immediately, and if
 	/* In case the task job is going to be scheduled immediately, and if
 	 * the task is "empty", calling _starpu_push_task would directly try to enforce
 	 * the task is "empty", calling _starpu_push_task would directly try to enforce
 	 * the dependencies of the task, and therefore it would try to grab the
 	 * the dependencies of the task, and therefore it would try to grab the
@@ -238,14 +252,9 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 	/* enforce data dependencies */
 	/* enforce data dependencies */
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	_starpu_enforce_deps_starting_from_task(j);
 	_starpu_enforce_deps_starting_from_task(j);
-
-	_starpu_spin_lock(&tag->lock);
-	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
-	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
-	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
 }
 }
 
 
-/* the lock must be taken ! */
+/* the lock of the tag must already be taken ! */
 static void _starpu_tag_add_succ(struct _starpu_tag *tag, struct _starpu_cg *cg)
 static void _starpu_tag_add_succ(struct _starpu_tag *tag, struct _starpu_cg *cg)
 {
 {
 	STARPU_ASSERT(tag);
 	STARPU_ASSERT(tag);
@@ -396,12 +405,10 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
 		_starpu_spin_lock(&tag_dep->lock);
-		_starpu_spin_lock(&tag_child->lock);
 		_starpu_tag_add_succ(tag_dep, cg);
 		_starpu_tag_add_succ(tag_dep, cg);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
-		_starpu_spin_unlock(&tag_child->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 	}
 	}
 }
 }
@@ -434,12 +441,10 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
 		_starpu_spin_lock(&tag_dep->lock);
-		_starpu_spin_lock(&tag_child->lock);
 		_starpu_tag_add_succ(tag_dep, cg);
 		_starpu_tag_add_succ(tag_dep, cg);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
-		_starpu_spin_unlock(&tag_child->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 	}
 	}
 	va_end(pa);
 	va_end(pa);

+ 2 - 0
src/core/dependencies/tags.h

@@ -68,6 +68,8 @@ void _starpu_notify_tag_dependencies(struct _starpu_tag *tag);
 void _starpu_notify_job_start_tag_dependencies(struct _starpu_tag *tag, _starpu_notify_job_start_data *data);
 void _starpu_notify_job_start_tag_dependencies(struct _starpu_tag *tag, _starpu_notify_job_start_data *data);
 
 
 void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job);
 void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job);
+
+/* lock should be taken, and this releases it */
 void _starpu_tag_set_ready(struct _starpu_tag *tag);
 void _starpu_tag_set_ready(struct _starpu_tag *tag);
 
 
 unsigned _starpu_submit_job_enforce_task_deps(struct _starpu_job *j);
 unsigned _starpu_submit_job_enforce_task_deps(struct _starpu_job *j);

+ 1 - 1
src/core/jobs.c

@@ -82,7 +82,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
 
 	job->task = task;
 	job->task = task;
 
 
-#ifndef STARPU_USE_FXT
+#if !defined(STARPU_USE_FXT) && !defined(STARPU_DEBUG)
 	if (_starpu_bound_recording || _starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1 || STARPU_AYU_EVENT)
 	if (_starpu_bound_recording || _starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1 || STARPU_AYU_EVENT)
 #endif
 #endif
 	{
 	{

+ 3 - 0
src/core/perfmodel/multiple_regression.c

@@ -236,6 +236,9 @@ int dgels_multiple_reg_coeff(double *mpar, double *my, unsigned long nn, unsigne
 	if( info != 0 )
 	if( info != 0 )
 	{
 	{
 		_STARPU_DISP("Warning: Problems when executing dgels_ function. It seems like the diagonal element %ld is zero.\n Multiple linear regression model will not be written into perfmodel file.\n", info);
 		_STARPU_DISP("Warning: Problems when executing dgels_ function. It seems like the diagonal element %ld is zero.\n Multiple linear regression model will not be written into perfmodel file.\n", info);
+		free(X);
+		free(Y);
+		free(work);
 		return 1;
 		return 1;
 	}
 	}
 
 

+ 13 - 10
src/datawizard/coherency.c

@@ -862,7 +862,7 @@ uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 
 
 /* in case the data was accessed on a write mode, do not forget to
 /* in case the data was accessed on a write mode, do not forget to
  * make it accessible again once it is possible ! */
  * make it accessible again once it is possible ! */
-void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, struct _starpu_data_replicate *replicate)
+void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
 {
 {
 	uint32_t wt_mask;
 	uint32_t wt_mask;
 	wt_mask = default_wt_mask | handle->wt_mask;
 	wt_mask = default_wt_mask | handle->wt_mask;
@@ -887,14 +887,17 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 	if (cpt == STARPU_SPIN_MAXTRY)
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
 
 
-	/* Release refcnt taken by fetch_data_on_node */
-	replicate->refcnt--;
-	STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
+	if (down_to_mode == STARPU_NONE)
+	{
+		/* Release refcnt taken by fetch_data_on_node */
+		replicate->refcnt--;
+		STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
 
 
-	STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
-	handle->busy_count--;
+		STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
+		handle->busy_count--;
+	}
 
 
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, down_to_mode))
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 
@@ -1229,7 +1232,7 @@ enomem:
 
 
 		local_replicate = get_replicate(handle, mode, workerid, node);
 		local_replicate = get_replicate(handle, mode, workerid, node);
 
 
-		_starpu_release_data_on_node(handle, 0, local_replicate);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
 	}
 	}
 
 
 	return -1;
 	return -1;
@@ -1338,13 +1341,13 @@ void __starpu_push_task_output(struct _starpu_job *j)
 		if (node == -1)
 		if (node == -1)
 		{
 		{
 			/* NOWHERE case, just notify dependencies */
 			/* NOWHERE case, just notify dependencies */
-			if (!_starpu_notify_data_dependencies(handle))
+			if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 				_starpu_spin_unlock(&handle->header_lock);
 				_starpu_spin_unlock(&handle->header_lock);
 		}
 		}
 		else
 		else
 		{
 		{
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
-			_starpu_release_data_on_node(handle, 0, local_replicate);
+			_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
 		}
 		}
 	}
 	}
 
 

+ 1 - 0
src/datawizard/coherency.h

@@ -316,6 +316,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
+				  enum starpu_data_access_mode down_to_mode,
 				  struct _starpu_data_replicate *replicate);
 				  struct _starpu_data_replicate *replicate);
 
 
 void _starpu_update_data_state(starpu_data_handle_t handle,
 void _starpu_update_data_state(starpu_data_handle_t handle,

+ 4 - 4
src/datawizard/copy_driver.c

@@ -375,8 +375,8 @@ int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
 	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) numblocks_1 * ld1_src, (unsigned long) ld2_src);
 	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) numblocks_1 * ld1_src, (unsigned long) ld2_src);
 	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) numblocks_1 * ld1_dst, (unsigned long) ld2_dst);
 	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) numblocks_1 * ld1_dst, (unsigned long) ld2_dst);
 
 
-	if (ld1_src * ld2_src == blocksize * numblocks_1 &&
-	    ld1_dst * ld2_dst == blocksize * numblocks_1)
+	if (ld2_src == blocksize * numblocks_1 &&
+	    ld2_dst == blocksize * numblocks_1)
 		/* Optimize contiguous case */
 		/* Optimize contiguous case */
 		return starpu_interface_copy(src, src_offset, src_node,
 		return starpu_interface_copy(src, src_offset, src_node,
 					     dst, dst_offset, dst_node,
 					     dst, dst_offset, dst_node,
@@ -425,8 +425,8 @@ int starpu_interface_copy4d(uintptr_t src, size_t src_offset, unsigned src_node,
 	STARPU_ASSERT_MSG(ld3_src >= numblocks_2 * ld2_src, "block group group size %lu is bigger than group group ld %lu in source", (unsigned long) numblocks_2 * ld2_src, (unsigned long) ld3_src);
 	STARPU_ASSERT_MSG(ld3_src >= numblocks_2 * ld2_src, "block group group size %lu is bigger than group group ld %lu in source", (unsigned long) numblocks_2 * ld2_src, (unsigned long) ld3_src);
 	STARPU_ASSERT_MSG(ld3_dst >= numblocks_2 * ld2_dst, "block group group size %lu is bigger than group group ld %lu in destination", (unsigned long) numblocks_2 * ld2_dst, (unsigned long) ld3_dst);
 	STARPU_ASSERT_MSG(ld3_dst >= numblocks_2 * ld2_dst, "block group group size %lu is bigger than group group ld %lu in destination", (unsigned long) numblocks_2 * ld2_dst, (unsigned long) ld3_dst);
 
 
-	if (ld1_src * ld2_src * ld3_src == blocksize * numblocks_1 * numblocks_2 &&
-	    ld1_dst * ld2_dst * ld3_dst == blocksize * numblocks_1 * numblocks_2)
+	if (ld3_src == blocksize * numblocks_1 * numblocks_2 &&
+	    ld3_dst == blocksize * numblocks_1 * numblocks_2)
 		/* Optimize contiguous case */
 		/* Optimize contiguous case */
 		return starpu_interface_copy(src, src_offset, src_node,
 		return starpu_interface_copy(src, src_offset, src_node,
 					     dst, dst_offset, dst_node,
 					     dst, dst_offset, dst_node,

+ 6 - 2
src/datawizard/interfaces/data_interface.c

@@ -749,12 +749,12 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	{
 	{
 		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, handle->home_node, replicate);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
 	}
 	}
 	else
 	else
 	{
 	{
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 	}
 }
 }
@@ -836,6 +836,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	int sequential_consistency = handle->sequential_consistency;
 	int sequential_consistency = handle->sequential_consistency;
 	if (sequential_consistency && !nowait)
 	if (sequential_consistency && !nowait)
 	{
 	{
+		/* We will acquire it in write mode to catch all dependencies,
+		 * but possibly it's not actually initialized. Fake it to avoid
+		 getting caught doing it */
+		handle->initialized = 1;
 		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_data_unregister must not be called from a task or callback, perhaps you can use starpu_data_unregister_submit instead");
 		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_data_unregister must not be called from a task or callback, perhaps you can use starpu_data_unregister_submit instead");
 
 
 		/* If sequential consistency is enabled, wait until data is available */
 		/* If sequential consistency is enabled, wait until data is available */

+ 31 - 8
src/datawizard/user_interactions.c

@@ -485,16 +485,26 @@ int starpu_data_acquire_try(starpu_data_handle_t handle, enum starpu_data_access
 
 
 /* This function must be called after starpu_data_acquire so that the
 /* This function must be called after starpu_data_acquire so that the
  * application release the data */
  * application release the data */
-void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int node)
 {
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT(handle);
 
 
+	if (mode == STARPU_RW)
+		/* They are equivalent here, and current_mode is never STARPU_RW */
+		mode = STARPU_W;
+
+	STARPU_ASSERT_MSG(mode == STARPU_NONE ||
+			  mode == handle->current_mode ||
+			  (mode == STARPU_R &&
+			     handle->current_mode == STARPU_W),
+		"We only support releasing from W to R");
+
 	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
 	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
-	_starpu_unlock_post_sync_tasks(handle);
+	_starpu_unlock_post_sync_tasks(handle, mode);
 
 
 	/* The application can now release the rw-lock */
 	/* The application can now release the rw-lock */
 	if (node >= 0)
 	if (node >= 0)
-		_starpu_release_data_on_node(handle, 0, &handle->per_node[node]);
+		_starpu_release_data_on_node(handle, 0, mode, &handle->per_node[node]);
 	else
 	else
 	{
 	{
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
@@ -505,17 +515,27 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 				handle->per_node[i].refcnt--;
 				handle->per_node[i].refcnt--;
 		}
 		}
 		handle->busy_count--;
 		handle->busy_count--;
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, mode))
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 	}
 }
 }
 
 
-void starpu_data_release(starpu_data_handle_t handle)
+void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
+{
+	starpu_data_release_to_on_node(handle, STARPU_NONE, node);
+}
+
+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 {
 	int home_node = handle->home_node;
 	int home_node = handle->home_node;
 	if (home_node < 0)
 	if (home_node < 0)
 		home_node = STARPU_MAIN_RAM;
 		home_node = STARPU_MAIN_RAM;
-	starpu_data_release_on_node(handle, home_node);
+	starpu_data_release_to_on_node(handle, mode, home_node);
+}
+
+void starpu_data_release(starpu_data_handle_t handle)
+{
+	starpu_data_release_to(handle, STARPU_NONE);
 }
 }
 
 
 static void _prefetch_data_on_node(void *arg)
 static void _prefetch_data_on_node(void *arg)
@@ -531,7 +551,7 @@ static void _prefetch_data_on_node(void *arg)
 		_starpu_data_acquire_wrapper_finished(wrapper);
 		_starpu_data_acquire_wrapper_finished(wrapper);
 
 
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 
@@ -581,7 +601,7 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		/* In case there was a temporary handle (eg. used for reduction), this
 		/* In case there was a temporary handle (eg. used for reduction), this
 		 * handle may have requested to be destroyed when the data is released
 		 * handle may have requested to be destroyed when the data is released
 		 * */
 		 * */
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 	}
 	else if (!async)
 	else if (!async)
@@ -666,6 +686,9 @@ static void _starpu_data_wont_use(void *data)
 
 
 void starpu_data_wont_use(starpu_data_handle_t handle)
 void starpu_data_wont_use(starpu_data_handle_t handle)
 {
 {
+	if (!handle->initialized)
+		/* No value atm actually */
+		return;
 	_STARPU_TRACE_DATA_WONT_USE(handle);
 	_STARPU_TRACE_DATA_WONT_USE(handle);
 	starpu_data_acquire_on_node_cb_sequential_consistency_quick(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_R, _starpu_data_wont_use, handle, 1, 1);
 	starpu_data_acquire_on_node_cb_sequential_consistency_quick(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_R, _starpu_data_wont_use, handle, 1, 1);
 }
 }

+ 1 - 1
src/datawizard/write_back.c

@@ -24,7 +24,7 @@ static void wt_callback(void *arg)
 	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
 	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
 
 
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 

+ 2 - 2
src/debug/latency.c

@@ -36,7 +36,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
 		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, node0, replicate_0);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
 
 
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
 		handle->refcnt++;
 		handle->refcnt++;
@@ -46,6 +46,6 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
 		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, node1, replicate_1);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
 	}
 	}
 }
 }

+ 19 - 13
src/debug/traces/starpu_fxt.c

@@ -333,7 +333,7 @@ static struct data_info *get_data(unsigned long handle, int mpi_rank)
 	return data;
 	return data;
 }
 }
 
 
-static void handle_papi_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+static void handle_papi_event(struct fxt_ev_64 *ev STARPU_ATTRIBUTE_UNUSED, struct starpu_fxt_options *options STARPU_ATTRIBUTE_UNUSED)
 {
 {
 #ifdef STARPU_PAPI
 #ifdef STARPU_PAPI
 	int event_code = ev->param[0];
 	int event_code = ev->param[0];
@@ -2307,9 +2307,11 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 			snprintf(paje_key, sizeof(paje_key), "com_%u", comid);
 			snprintf(paje_key, sizeof(paje_key), "com_%u", comid);
 			program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
 			program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
 			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, src);
 			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, src);
-			poti_StartLink(time, program_container, link_type, src_memnode_container, paje_value, paje_key);
+			char str_handle[STARPU_POTI_STR_LEN];
+			snprintf(str_handle, sizeof(str_handle), "%lx", handle);
+			poti_user_StartLink(_starpu_poti_CommLinkStart, time, program_container, link_type, src_memnode_container, paje_value, paje_key, 1, str_handle);
 #else
 #else
-			fprintf(out_paje_file, "18	%.9f	%s	%sp	%u	%smm%u	com_%u\n", time, link_type, prefix, size, prefix, src, comid);
+			fprintf(out_paje_file, "24	%.9f	%s	%sp	%u	%smm%u	com_%u	%lx\n", time, link_type, prefix, size, prefix, src, comid, handle);
 #endif
 #endif
 		}
 		}
 
 
@@ -2636,10 +2638,12 @@ static void handle_job_push(struct fxt_ev_64 *ev, struct starpu_fxt_options *opt
                char paje_value[STARPU_POTI_STR_LEN];
                char paje_value[STARPU_POTI_STR_LEN];
                snprintf(paje_value, sizeof(paje_value), "%u", task);
                snprintf(paje_value, sizeof(paje_value), "%u", task);
                snprintf(container, sizeof(container), "%sp", options->file_prefix);
                snprintf(container, sizeof(container), "%sp", options->file_prefix);
-               poti_NewEvent(get_event_time_stamp(ev, options), container, "pu", paje_value);
+		if (!options->no_events)
+			poti_NewEvent(get_event_time_stamp(ev, options), container, "pu", paje_value);
 #else
 #else
-	       fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
-               fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "pu", options->file_prefix, task);
+		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
+		if (!options->no_events)
+			fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "pu", options->file_prefix, task);
 #endif
 #endif
 	}
 	}
 
 
@@ -2652,9 +2656,9 @@ static void handle_job_push(struct fxt_ev_64 *ev, struct starpu_fxt_options *opt
 		fprintf(sched_tasks_file, "Time: %.9f\n", current_timestamp);
 		fprintf(sched_tasks_file, "Time: %.9f\n", current_timestamp);
 		fprintf(sched_tasks_file, "Priority: %d\n", priority);
 		fprintf(sched_tasks_file, "Priority: %d\n", priority);
 		if (options->file_rank < 0)
 		if (options->file_rank < 0)
-			fprintf(sched_tasks_file, "JobId: %d\n", task);
+			fprintf(sched_tasks_file, "JobId: %u\n", task);
 		else
 		else
-			fprintf(sched_tasks_file, "JobId: %d_%d\n", options->file_rank, task);
+			fprintf(sched_tasks_file, "JobId: %d_%u\n", options->file_rank, task);
 		fprintf(sched_tasks_file, "\n");
 		fprintf(sched_tasks_file, "\n");
 	}
 	}
 }
 }
@@ -2681,11 +2685,13 @@ static void handle_job_pop(struct fxt_ev_64 *ev, struct starpu_fxt_options *opti
 		char paje_value[STARPU_POTI_STR_LEN];
 		char paje_value[STARPU_POTI_STR_LEN];
 		snprintf(paje_value, sizeof(paje_value), "%u", task);
 		snprintf(paje_value, sizeof(paje_value), "%u", task);
 		snprintf(container, sizeof(container), "%sp", options->file_prefix);
 		snprintf(container, sizeof(container), "%sp", options->file_prefix);
-		poti_NewEvent(get_event_time_stamp(ev, options), container, "po", paje_value);
+		if (!options->no_events)
+			poti_NewEvent(get_event_time_stamp(ev, options), container, "po", paje_value);
 #else
 #else
 		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
 		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
 		fprintf(out_paje_file, "13	%.9f	%ssched	nsubmitted	%f\n", current_timestamp, options->file_prefix, (float)nsubmitted);
 		fprintf(out_paje_file, "13	%.9f	%ssched	nsubmitted	%f\n", current_timestamp, options->file_prefix, (float)nsubmitted);
-		fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "po", options->file_prefix, task);
+		if (!options->no_events)
+			fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "po", options->file_prefix, task);
 #endif
 #endif
 	}
 	}
 
 
@@ -2701,9 +2707,9 @@ static void handle_job_pop(struct fxt_ev_64 *ev, struct starpu_fxt_options *opti
 		fprintf(sched_tasks_file, "Time: %.9f\n", current_timestamp);
 		fprintf(sched_tasks_file, "Time: %.9f\n", current_timestamp);
 		fprintf(sched_tasks_file, "Priority: %d\n", priority);
 		fprintf(sched_tasks_file, "Priority: %d\n", priority);
 		if (options->file_rank < 0)
 		if (options->file_rank < 0)
-			fprintf(sched_tasks_file, "JobId: %d\n", task);
+			fprintf(sched_tasks_file, "JobId: %u\n", task);
 		else
 		else
-			fprintf(sched_tasks_file, "JobId: %d_%d\n", options->file_rank, task);
+			fprintf(sched_tasks_file, "JobId: %d_%u\n", options->file_rank, task);
 		fprintf(sched_tasks_file, "\n");
 		fprintf(sched_tasks_file, "\n");
 	}
 	}
 }
 }
@@ -4450,7 +4456,7 @@ void _starpu_fxt_number_events_file_close(void)
 		for (i = 0; i <= FUT_SETUP_CODE; i++)
 		for (i = 0; i <= FUT_SETUP_CODE; i++)
 		{
 		{
 			if (number_events[i] > 0)
 			if (number_events[i] > 0)
-				fprintf(number_events_file, "0x%x\t%lu\n", i, number_events[i]);
+				fprintf(number_events_file, "0x%x\t%"PRIu64"\n", i, number_events[i]);
 		}
 		}
 
 
 		free(number_events);
 		free(number_events);

+ 1 - 0
src/debug/traces/starpu_fxt.h

@@ -68,6 +68,7 @@ void _starpu_fxt_write_paje_header(FILE *file, struct starpu_fxt_options *option
 extern int _starpu_poti_extendedSetState;
 extern int _starpu_poti_extendedSetState;
 extern int _starpu_poti_semiExtendedSetState;
 extern int _starpu_poti_semiExtendedSetState;
 extern int _starpu_poti_MemoryEvent;
 extern int _starpu_poti_MemoryEvent;
+extern int _starpu_poti_CommLinkStart;
 extern int _starpu_poti_MpiLinkStart;
 extern int _starpu_poti_MpiLinkStart;
 
 
 /*
 /*

+ 8 - 6
src/debug/traces/starpu_fxt_mpi.c

@@ -331,7 +331,9 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
 
 			char str_mpi_tag[STARPU_POTI_STR_LEN];
 			char str_mpi_tag[STARPU_POTI_STR_LEN];
 			snprintf(str_mpi_tag, sizeof(str_mpi_tag), "%ld", mpi_tag);
 			snprintf(str_mpi_tag, sizeof(str_mpi_tag), "%ld", mpi_tag);
-			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 1, str_mpi_tag);
+			char str_handle[STARPU_POTI_STR_LEN];
+			snprintf(str_handle, sizeof(str_handle), "%lx", send_handle);
+			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 2, str_mpi_tag, str_handle);
 
 
 			poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_out_bandwidth[src]);
 			poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_out_bandwidth[src]);
 			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
 			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
@@ -340,7 +342,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 #else
 #else
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo_mpi	%f\n", start_date, src, current_out_bandwidth[src]);
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo_mpi	%f\n", start_date, src, current_out_bandwidth[src]);
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi_mpi	%f\n", start_date, dst, current_in_bandwidth[dst]);
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi_mpi	%f\n", start_date, dst, current_in_bandwidth[dst]);
-			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld\n", start_date, (unsigned long)size, src, id, mpi_tag);
+			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld	%lx\n", start_date, (unsigned long)size, src, id, mpi_tag, send_handle);
 			fprintf(out_paje_file, "19	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
 			fprintf(out_paje_file, "19	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
 #endif
 #endif
 
 
@@ -354,10 +356,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 				fprintf(out_comms_file, "SendHandle: %lx\n", send_handle);
 				fprintf(out_comms_file, "SendHandle: %lx\n", send_handle);
 				fprintf(out_comms_file, "RecvHandle: %lx\n", recv_handle);
 				fprintf(out_comms_file, "RecvHandle: %lx\n", recv_handle);
 				if (cur->jobid != -1)
 				if (cur->jobid != -1)
-					fprintf(out_comms_file, "SendJobId: %d_%lu\n", src, cur->jobid);
+					fprintf(out_comms_file, "SendJobId: %d_%ld\n", src, cur->jobid);
 				if (match->jobid != -1)
 				if (match->jobid != -1)
-					fprintf(out_comms_file, "RecvJobId: %d_%lu\n", dst, match->jobid);
-				fprintf(out_comms_file, "Size: %ld\n", size);
+					fprintf(out_comms_file, "RecvJobId: %d_%ld\n", dst, match->jobid);
+				fprintf(out_comms_file, "Size: %lu\n", (unsigned long)size);
 				fprintf(out_comms_file, "\n");
 				fprintf(out_comms_file, "\n");
 			}
 			}
 		}
 		}
@@ -372,7 +374,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 	if (nb_wrong_comm_timing == 1)
 	if (nb_wrong_comm_timing == 1)
 		_STARPU_MSG("Warning: a communication finished before it started !\n");
 		_STARPU_MSG("Warning: a communication finished before it started !\n");
 	else if (nb_wrong_comm_timing > 1)
 	else if (nb_wrong_comm_timing > 1)
-		_STARPU_MSG("Warning: %d communications finished before they started !\n", nb_wrong_comm_timing);
+		_STARPU_MSG("Warning: %u communications finished before they started !\n", nb_wrong_comm_timing);
 }
 }
 
 
 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)

+ 7 - 4
src/debug/traces/starpu_paje.c

@@ -28,6 +28,7 @@
 int _starpu_poti_extendedSetState = -1;
 int _starpu_poti_extendedSetState = -1;
 int _starpu_poti_semiExtendedSetState = -1;
 int _starpu_poti_semiExtendedSetState = -1;
 int _starpu_poti_MemoryEvent = -1;
 int _starpu_poti_MemoryEvent = -1;
+int _starpu_poti_CommLinkStart = -1;
 int _starpu_poti_MpiLinkStart = -1;
 int _starpu_poti_MpiLinkStart = -1;
 #endif
 #endif
 #endif
 #endif
@@ -62,6 +63,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 						     "SubmitOrder string"
 						     "SubmitOrder string"
 						     );
 						     );
 #ifdef HAVE_POTI_USER_NEWEVENT
 #ifdef HAVE_POTI_USER_NEWEVENT
+	_starpu_poti_CommLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "Handle string");
 	if (options->memory_states)
 	if (options->memory_states)
 	{
 	{
 		_starpu_poti_MemoryEvent = poti_header_DeclareEvent (PAJE_NewEvent,
 		_starpu_poti_MemoryEvent = poti_header_DeclareEvent (PAJE_NewEvent,
@@ -71,7 +73,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 							     "Size string",
 							     "Size string",
 							     "Dest string");
 							     "Dest string");
 	}
 	}
-	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "MPITAG string");
+	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 2, "MPITAG string", "Handle string");
 #endif
 #endif
 #else
 #else
 	poti_header(1,1);
 	poti_header(1,1);
@@ -230,15 +232,16 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 	fprintf(file, "%%	StartContainer	string\n");
 	fprintf(file, "%%	StartContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%	MPITAG	string\n");
 	fprintf(file, "%%	MPITAG	string\n");
+	fprintf(file, "%%	Handle	string\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EndEventDef\n");
-	fprintf(file, "%%EventDef	PajeEndLink	24\n");
+	fprintf(file, "%%EventDef	PajeStartLink	24\n");
 	fprintf(file, "%%	Time	date\n");
 	fprintf(file, "%%	Time	date\n");
 	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Container	string\n");
 	fprintf(file, "%%	Container	string\n");
 	fprintf(file, "%%	Value	string\n");
 	fprintf(file, "%%	Value	string\n");
-	fprintf(file, "%%	EndContainer	string\n");
+	fprintf(file, "%%	StartContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%	Key	string\n");
-	fprintf(file, "%%	MPITAG	string\n");
+	fprintf(file, "%%	Handle	string\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EndEventDef\n");
 #endif
 #endif
 
 

+ 30 - 25
src/sched_policies/component_work_stealing.c

@@ -30,14 +30,20 @@
 #warning TODO: locality work-stealing
 #warning TODO: locality work-stealing
 #endif
 #endif
 
 
+struct _starpu_component_work_stealing_data_per_worker
+{
+	struct _starpu_prio_deque fifo;
+	unsigned last_pop_child;
+};
+
 struct _starpu_component_work_stealing_data
 struct _starpu_component_work_stealing_data
 {
 {
 /* keep track of the work performed from the beginning of the algorithm to make
 /* keep track of the work performed from the beginning of the algorithm to make
  * better decisions about which queue to child when stealing or deferring work
  * better decisions about which queue to child when stealing or deferring work
  */
  */
-	unsigned performed_total, last_pop_child, last_push_child;
+	struct _starpu_component_work_stealing_data_per_worker *per_worker;
+	unsigned performed_total, last_push_child;
 
 
-	struct _starpu_prio_deque * fifos;
 	starpu_pthread_mutex_t ** mutexes;
 	starpu_pthread_mutex_t ** mutexes;
 	unsigned size;
 	unsigned size;
 };
 };
@@ -50,16 +56,14 @@ struct _starpu_component_work_stealing_data
 static struct starpu_task *  steal_task_round_robin(struct starpu_sched_component *component, int workerid)
 static struct starpu_task *  steal_task_round_robin(struct starpu_sched_component *component, int workerid)
 {
 {
 	struct _starpu_component_work_stealing_data *wsd = component->data;
 	struct _starpu_component_work_stealing_data *wsd = component->data;
-	STARPU_HG_DISABLE_CHECKING(wsd->last_pop_child);
-	unsigned i = wsd->last_pop_child;
-	wsd->last_pop_child = (i + 1) % component->nchildren;
-	STARPU_HG_ENABLE_CHECKING(wsd->last_pop_child);
+	unsigned i = wsd->per_worker[workerid].last_pop_child;
+	wsd->per_worker[workerid].last_pop_child = (i + 1) % component->nchildren;
 	/* If the worker's queue have no suitable tasks, let's try
 	/* If the worker's queue have no suitable tasks, let's try
 	 * the next ones */
 	 * the next ones */
 	struct starpu_task * task = NULL;
 	struct starpu_task * task = NULL;
 	while (1)
 	while (1)
 	{
 	{
-		struct _starpu_prio_deque * fifo = &wsd->fifos[i];
+		struct _starpu_prio_deque * fifo = &wsd->per_worker[i].fifo;
 
 
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 		task = _starpu_prio_deque_deque_task_for_worker(fifo, workerid, NULL);
 		task = _starpu_prio_deque_deque_task_for_worker(fifo, workerid, NULL);
@@ -75,7 +79,7 @@ static struct starpu_task *  steal_task_round_robin(struct starpu_sched_componen
 			break;
 			break;
 		}
 		}
 
 
-		if (i == wsd->last_pop_child)
+		if (i == wsd->per_worker[workerid].last_pop_child)
 		{
 		{
 			/* We got back to the first worker,
 			/* We got back to the first worker,
 			 * don't go in infinite loop */
 			 * don't go in infinite loop */
@@ -141,17 +145,17 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 	struct _starpu_component_work_stealing_data * wsd = component->data;
 	struct _starpu_component_work_stealing_data * wsd = component->data;
 	const double now = starpu_timing_now();
 	const double now = starpu_timing_now();
 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->fifos[i]);
+	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->per_worker[i].fifo);
 	if(task)
 	if(task)
 	{
 	{
 		if(!isnan(task->predicted))
 		if(!isnan(task->predicted))
 		{
 		{
-			wsd->fifos[i].exp_len -= task->predicted;
-			wsd->fifos[i].exp_start = now + task->predicted;
+			wsd->per_worker[i].fifo.exp_len -= task->predicted;
+			wsd->per_worker[i].fifo.exp_start = now + task->predicted;
 		}
 		}
 	}
 	}
 	else
 	else
-		wsd->fifos[i].exp_len = 0.0;
+		wsd->per_worker[i].fifo.exp_len = 0.0;
 
 
 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 	if(task)
 	if(task)
@@ -163,7 +167,7 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 	if(task)
 	if(task)
 	{
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		wsd->fifos[i].nprocessed++;
+		wsd->per_worker[i].fifo.nprocessed++;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 
 		return task;
 		return task;
@@ -196,9 +200,9 @@ double _ws_estimated_end(struct starpu_sched_component * component)
 	for(i = 0; i < component->nchildren; i++)
 	for(i = 0; i < component->nchildren; i++)
 	{
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		sum_len += wsd->fifos[i].exp_len;
-		wsd->fifos[i].exp_start = STARPU_MAX(now, wsd->fifos[i].exp_start);
-		sum_start += wsd->fifos[i].exp_start;
+		sum_len += wsd->per_worker[i].fifo.exp_len;
+		wsd->per_worker[i].fifo.exp_start = STARPU_MAX(now, wsd->per_worker[i].fifo.exp_start);
+		sum_start += wsd->per_worker[i].fifo.exp_start;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 
 	}
 	}
@@ -216,7 +220,7 @@ double _ws_estimated_load(struct starpu_sched_component * component)
 	for(i = 0; i < component->nchildren; i++)
 	for(i = 0; i < component->nchildren; i++)
 	{
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		ntasks += wsd->fifos[i].ntasks;
+		ntasks += wsd->per_worker[i].fifo.ntasks;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 	}
 	}
 	double speedup = 0.0;
 	double speedup = 0.0;
@@ -265,7 +269,7 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 
 
 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 	starpu_sched_task_break(task);
 	starpu_sched_task_break(task);
-	ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i], task);
+	ret = _starpu_prio_deque_push_front_task(&wsd->per_worker[i].fifo, task);
 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 
 	wsd->last_push_child = i;
 	wsd->last_push_child = i;
@@ -308,9 +312,9 @@ int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task)
 
 
 			struct _starpu_component_work_stealing_data * wsd = component->data;
 			struct _starpu_component_work_stealing_data * wsd = component->data;
 			STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 			STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-			int ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i] , task);
+			int ret = _starpu_prio_deque_push_front_task(&wsd->per_worker[i].fifo , task);
 			if(ret == 0 && !isnan(task->predicted))
 			if(ret == 0 && !isnan(task->predicted))
-				wsd->fifos[i].exp_len += task->predicted;
+				wsd->per_worker[i].fifo.exp_len += task->predicted;
 			STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 			STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 
 			component->can_pull(component);
 			component->can_pull(component);
@@ -329,12 +333,13 @@ void _ws_add_child(struct starpu_sched_component * component, struct starpu_sche
 	if(wsd->size < component->nchildren)
 	if(wsd->size < component->nchildren)
 	{
 	{
 		STARPU_ASSERT(wsd->size == component->nchildren - 1);
 		STARPU_ASSERT(wsd->size == component->nchildren - 1);
-		_STARPU_REALLOC(wsd->fifos, component->nchildren * sizeof(*wsd->fifos));
+		_STARPU_REALLOC(wsd->per_worker, component->nchildren * sizeof(*wsd->per_worker));
 		_STARPU_REALLOC(wsd->mutexes, component->nchildren * sizeof(*wsd->mutexes));
 		_STARPU_REALLOC(wsd->mutexes, component->nchildren * sizeof(*wsd->mutexes));
 		wsd->size = component->nchildren;
 		wsd->size = component->nchildren;
 	}
 	}
 
 
-	_starpu_prio_deque_init(&wsd->fifos[component->nchildren - 1]);
+	wsd->per_worker[component->nchildren - 1].last_pop_child = 0;
+	_starpu_prio_deque_init(&wsd->per_worker[component->nchildren - 1].fifo);
 
 
 	starpu_pthread_mutex_t *mutex;
 	starpu_pthread_mutex_t *mutex;
 	_STARPU_MALLOC(mutex, sizeof(*mutex));
 	_STARPU_MALLOC(mutex, sizeof(*mutex));
@@ -356,8 +361,8 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 			break;
 			break;
 	}
 	}
 	STARPU_ASSERT(i_component != component->nchildren);
 	STARPU_ASSERT(i_component != component->nchildren);
-	struct _starpu_prio_deque tmp_fifo = wsd->fifos[i_component];
-	wsd->fifos[i_component] = wsd->fifos[component->nchildren - 1];
+	struct _starpu_prio_deque tmp_fifo = wsd->per_worker[i_component].fifo;
+	wsd->per_worker[i_component].fifo = wsd->per_worker[component->nchildren - 1].fifo;
 
 
 
 
 	component->children[i_component] = component->children[component->nchildren - 1];
 	component->children[i_component] = component->children[component->nchildren - 1];
@@ -372,7 +377,7 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 void _work_stealing_component_deinit_data(struct starpu_sched_component * component)
 void _work_stealing_component_deinit_data(struct starpu_sched_component * component)
 {
 {
 	struct _starpu_component_work_stealing_data * wsd = component->data;
 	struct _starpu_component_work_stealing_data * wsd = component->data;
-	free(wsd->fifos);
+	free(wsd->per_worker);
 	free(wsd->mutexes);
 	free(wsd->mutexes);
 	free(wsd);
 	free(wsd);
 }
 }

+ 68 - 22
src/sched_policies/work_stealing_policy.c

@@ -71,10 +71,21 @@ struct locality_entry
 
 
 struct _starpu_work_stealing_data_per_worker
 struct _starpu_work_stealing_data_per_worker
 {
 {
+	char fill1[STARPU_CACHELINE_SIZE];
+	/* This is read-mostly, only updated when the queue becomes empty or
+	 * becomes non-empty, to make it generally cheap to check */
+	unsigned notask;	/* whether the queue is empty */
+	char fill2[STARPU_CACHELINE_SIZE];
+
 	struct _starpu_prio_deque queue;
 	struct _starpu_prio_deque queue;
 	int running;
 	int running;
 	int *proxlist;
 	int *proxlist;
-	int busy;
+	int busy;	/* Whether this worker is working on a task */
+
+	/* keep track of the work performed from the beginning of the algorithm to make
+	 * better decisions about which queue to select when deferring work
+	 */
+	unsigned last_pop_worker;
 
 
 #ifdef USE_LOCALITY_TASKS
 #ifdef USE_LOCALITY_TASKS
 	/* This records the same as queue, but hashed by data accessed with locality flag.  */
 	/* This records the same as queue, but hashed by data accessed with locality flag.  */
@@ -93,9 +104,8 @@ struct _starpu_work_stealing_data
 	int (*select_victim)(struct _starpu_work_stealing_data *, unsigned, int);
 	int (*select_victim)(struct _starpu_work_stealing_data *, unsigned, int);
 	struct _starpu_work_stealing_data_per_worker *per_worker;
 	struct _starpu_work_stealing_data_per_worker *per_worker;
 	/* keep track of the work performed from the beginning of the algorithm to make
 	/* keep track of the work performed from the beginning of the algorithm to make
-	 * better decisions about which queue to select when stealing or deferring work
+	 * better decisions about which queue to select when deferring work
 	 */
 	 */
-	unsigned last_pop_worker;
 	unsigned last_push_worker;
 	unsigned last_push_worker;
 };
 };
 
 
@@ -118,7 +128,8 @@ static int calibration_value = 0;
  */
  */
 static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsigned sched_ctx_id)
 static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsigned sched_ctx_id)
 {
 {
-	unsigned worker = ws->last_pop_worker;
+	unsigned workerid = starpu_worker_get_id_check();
+	unsigned worker = ws->per_worker[workerid].last_pop_worker;
 	unsigned nworkers;
 	unsigned nworkers;
 	int *workerids = NULL;
 	int *workerids = NULL;
 	nworkers = starpu_sched_ctx_get_workers_list_raw(sched_ctx_id, &workerids);
 	nworkers = starpu_sched_ctx_get_workers_list_raw(sched_ctx_id, &workerids);
@@ -131,14 +142,17 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 		/* Here helgrind would shout that this is unprotected, but we
 		/* Here helgrind would shout that this is unprotected, but we
 		 * are fine with getting outdated values, this is just an
 		 * are fine with getting outdated values, this is just an
 		 * estimation */
 		 * estimation */
-		ntasks = ws->per_worker[workerids[worker]].queue.ntasks;
-
-		if (ntasks && (ws->per_worker[workerids[worker]].busy
-					   || starpu_worker_is_blocked_in_parallel(workerids[worker])))
-			break;
+		if (!ws->per_worker[workerids[worker]].notask)
+		{
+			if (ws->per_worker[workerids[worker]].busy
+						   || starpu_worker_is_blocked_in_parallel(workerids[worker])) {
+				ntasks = 1;
+				break;
+			}
+		}
 
 
 		worker = (worker + 1) % nworkers;
 		worker = (worker + 1) % nworkers;
-		if (worker == ws->last_pop_worker)
+		if (worker == ws->per_worker[workerid].last_pop_worker)
 		{
 		{
 			/* We got back to the first worker,
 			/* We got back to the first worker,
 			 * don't go in infinite loop */
 			 * don't go in infinite loop */
@@ -147,7 +161,7 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 		}
 		}
 	}
 	}
 
 
-	ws->last_pop_worker = (worker + 1) % nworkers;
+	ws->per_worker[workerid].last_pop_worker = (worker + 1) % nworkers;
 
 
 	worker = workerids[worker];
 	worker = workerids[worker];
 
 
@@ -327,15 +341,31 @@ static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, i
 	{
 	{
 		/* found an interesting task, try to pick it! */
 		/* found an interesting task, try to pick it! */
 		if (_starpu_prio_deque_pop_this_task(&data_source->queue, target, best_task))
 		if (_starpu_prio_deque_pop_this_task(&data_source->queue, target, best_task))
+		{
+			if (!data_source->queue.ntasks)
+			{
+				STARPU_ASSERT(ws->per_worker[source].notask == 0);
+				ws->per_worker[source].notask = 1;
+			}
 			return best_task;
 			return best_task;
+		}
 	}
 	}
 
 
 	/* Didn't find an interesting task, or couldn't run it :( */
 	/* Didn't find an interesting task, or couldn't run it :( */
 	int skipped;
 	int skipped;
+	struct starpu_task *task;
+
 	if (source != target)
 	if (source != target)
-		return _starpu_prio_deque_deque_task_for_worker(&data_source->queue, target, &skipped);
+		task = _starpu_prio_deque_deque_task_for_worker(&data_source->queue, target, &skipped);
 	else
 	else
-		return _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
+		task = _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
+
+	if (task && !data_source->queue.ntasks)
+	{
+		STARPU_ASSERT(ws->per_worker[source].notask == 0);
+		ws->per_worker[source].notask = 1;
+	}
+	return task;
 }
 }
 
 
 /* Called when popping a task from a queue */
 /* Called when popping a task from a queue */
@@ -371,10 +401,18 @@ static void locality_pushed_task(struct _starpu_work_stealing_data *ws STARPU_AT
 static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, int source, int target)
 static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, int source, int target)
 {
 {
 	int skipped;
 	int skipped;
+	struct starpu_task *task;
 	if (source != target)
 	if (source != target)
-		return _starpu_prio_deque_deque_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
+		task = _starpu_prio_deque_deque_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
 	else
 	else
-		return _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
+		task = _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
+
+	if (task && !ws->per_worker[source].queue.ntasks)
+	{
+		STARPU_ASSERT(ws->per_worker[source].notask == 0);
+		ws->per_worker[source].notask = 1;
+	}
+	return task;
 }
 }
 /* Called when popping a task from a queue */
 /* Called when popping a task from a queue */
 static void locality_popped_task(struct _starpu_work_stealing_data *ws STARPU_ATTRIBUTE_UNUSED, struct starpu_task *task STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED, unsigned sched_ctx_id STARPU_ATTRIBUTE_UNUSED)
 static void locality_popped_task(struct _starpu_work_stealing_data *ws STARPU_ATTRIBUTE_UNUSED, struct starpu_task *task STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED, unsigned sched_ctx_id STARPU_ATTRIBUTE_UNUSED)
@@ -530,7 +568,8 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 	struct starpu_task *task = NULL;
 	struct starpu_task *task = NULL;
 	unsigned workerid = starpu_worker_get_id_check();
 	unsigned workerid = starpu_worker_get_id_check();
 
 
-	ws->per_worker[workerid].busy = 0;
+	if (ws->per_worker[workerid].busy)
+		ws->per_worker[workerid].busy = 0;
 
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 	if (STARPU_RUNNING_ON_VALGRIND || !_starpu_prio_deque_is_empty(&ws->per_worker[workerid].queue))
 	if (STARPU_RUNNING_ON_VALGRIND || !_starpu_prio_deque_is_empty(&ws->per_worker[workerid].queue))
@@ -617,7 +656,8 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 		if (!task)
 		if (!task)
 			return NULL;
 			return NULL;
 	}
 	}
-	ws->per_worker[workerid].busy = !!task;
+	if (ws->per_worker[workerid].busy != !!task)
+		ws->per_worker[workerid].busy = !!task;
 	return task;
 	return task;
 }
 }
 
 
@@ -648,6 +688,11 @@ int ws_push_task(struct starpu_task *task)
 	record_data_locality(task, workerid);
 	record_data_locality(task, workerid);
 	STARPU_ASSERT_MSG(ws->per_worker[workerid].running, "workerid=%d, ws=%p\n", workerid, ws);
 	STARPU_ASSERT_MSG(ws->per_worker[workerid].running, "workerid=%d, ws=%p\n", workerid, ws);
 	_starpu_prio_deque_push_back_task(&ws->per_worker[workerid].queue, task);
 	_starpu_prio_deque_push_back_task(&ws->per_worker[workerid].queue, task);
+	if (ws->per_worker[workerid].queue.ntasks == 1)
+	{
+		STARPU_ASSERT(ws->per_worker[workerid].notask == 1);
+		ws->per_worker[workerid].notask = 0;
+	}
 	locality_pushed_task(ws, task, workerid, sched_ctx_id);
 	locality_pushed_task(ws, task, workerid, sched_ctx_id);
 
 
 	starpu_push_task_end(task);
 	starpu_push_task_end(task);
@@ -676,10 +721,12 @@ static void ws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworke
 		int workerid = workerids[i];
 		int workerid = workerids[i];
 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
 		_starpu_prio_deque_init(&ws->per_worker[workerid].queue);
 		_starpu_prio_deque_init(&ws->per_worker[workerid].queue);
+		ws->per_worker[workerid].notask = 1;
 		ws->per_worker[workerid].running = 1;
 		ws->per_worker[workerid].running = 1;
 
 
 		/* Tell helgrind that we are fine with getting outdated values,
 		/* Tell helgrind that we are fine with getting outdated values,
 		 * this is just an estimation */
 		 * this is just an estimation */
+		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].notask);
 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].queue.ntasks);
 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].queue.ntasks);
 		ws->per_worker[workerid].busy = 0;
 		ws->per_worker[workerid].busy = 0;
 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].busy);
 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].busy);
@@ -708,9 +755,7 @@ static void initialize_ws_policy(unsigned sched_ctx_id)
 	_STARPU_MALLOC(ws, sizeof(struct _starpu_work_stealing_data));
 	_STARPU_MALLOC(ws, sizeof(struct _starpu_work_stealing_data));
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
 
 
-	ws->last_pop_worker = 0;
 	ws->last_push_worker = 0;
 	ws->last_push_worker = 0;
-	STARPU_HG_DISABLE_CHECKING(ws->last_pop_worker);
 	STARPU_HG_DISABLE_CHECKING(ws->last_push_worker);
 	STARPU_HG_DISABLE_CHECKING(ws->last_push_worker);
 	ws->select_victim = select_victim;
 	ws->select_victim = select_victim;
 
 
@@ -760,11 +805,12 @@ static int lws_select_victim(struct _starpu_work_stealing_data *ws, unsigned sch
 	for (i = 0; i < nworkers; i++)
 	for (i = 0; i < nworkers; i++)
 	{
 	{
 		int neighbor = ws->per_worker[workerid].proxlist[i];
 		int neighbor = ws->per_worker[workerid].proxlist[i];
+		if (ws->per_worker[neighbor].notask)
+			continue;
                 /* FIXME: do not keep looking again and again at some worker
                 /* FIXME: do not keep looking again and again at some worker
                  * which has tasks, but that can't execute on me */
                  * which has tasks, but that can't execute on me */
-		int ntasks = ws->per_worker[neighbor].queue.ntasks;
-		if (ntasks && (ws->per_worker[neighbor].busy
-					   || starpu_worker_is_blocked_in_parallel(neighbor)))
+		if (ws->per_worker[neighbor].busy
+					   || starpu_worker_is_blocked_in_parallel(neighbor))
 			return neighbor;
 			return neighbor;
 	}
 	}
 	return -1;
 	return -1;

+ 1 - 2
src/util/execute_on_all.c

@@ -107,8 +107,7 @@ void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned nworkers = starpu_worker_get_count();
 	struct starpu_task *tasks[STARPU_NMAXWORKERS];
 	struct starpu_task *tasks[STARPU_NMAXWORKERS];
 
 
-	/* This method only work on CPU, CUDA, OPENCL */
-	STARPU_ASSERT((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0);
+	STARPU_ASSERT_MSG((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0, "This function is implemented only on CPU, CUDA, OpenCL");
 
 
 	/* create a wrapper codelet */
 	/* create a wrapper codelet */
 	struct starpu_codelet wrapper_cl =
 	struct starpu_codelet wrapper_cl =

+ 1 - 0
src/util/starpu_data_cpy.c

@@ -159,6 +159,7 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 	task->callback_func = callback_func;
 	task->callback_func = callback_func;
 	task->callback_arg = callback_arg;
 	task->callback_arg = callback_arg;
 
 
+	/* FIXME: priority!! */
 	STARPU_TASK_SET_HANDLE(task, dst_handle, 0);
 	STARPU_TASK_SET_HANDLE(task, dst_handle, 0);
 	STARPU_TASK_SET_HANDLE(task, src_handle, 1);
 	STARPU_TASK_SET_HANDLE(task, src_handle, 1);
 
 

+ 18 - 1
tests/Makefile.am

@@ -43,6 +43,7 @@ EXTRA_DIST =					\
 	microbenchs/parallel_independent_heterogeneous_tasks.sh	\
 	microbenchs/parallel_independent_heterogeneous_tasks.sh	\
 	microbenchs/parallel_independent_homogeneous_tasks_data.sh	\
 	microbenchs/parallel_independent_homogeneous_tasks_data.sh	\
 	microbenchs/parallel_independent_homogeneous_tasks.sh	\
 	microbenchs/parallel_independent_homogeneous_tasks.sh	\
+	microbenchs/bandwidth_scheds.sh		\
 	energy/static.sh			\
 	energy/static.sh			\
 	energy/dynamic.sh			\
 	energy/dynamic.sh			\
 	energy/perfs.gp				\
 	energy/perfs.gp				\
@@ -73,7 +74,7 @@ EXTRA_DIST =					\
 	model-checking/starpu-mc.sh.in
 	model-checking/starpu-mc.sh.in
 
 
 CLEANFILES = 					\
 CLEANFILES = 					\
-	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90
+	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90 bandwidth-*.dat bandwidth.gp bandwidth.eps bandwidth.svg
 
 
 BUILT_SOURCES =
 BUILT_SOURCES =
 SUBDIRS =
 SUBDIRS =
@@ -268,6 +269,7 @@ myPROGRAMS +=				\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release2		\
 	datawizard/acquire_release2		\
+	datawizard/acquire_release_to		\
 	datawizard/acquire_try			\
 	datawizard/acquire_try			\
 	datawizard/bcsr				\
 	datawizard/bcsr				\
 	datawizard/cache			\
 	datawizard/cache			\
@@ -355,6 +357,7 @@ myPROGRAMS +=				\
 	microbenchs/prefetch_data_on_node 	\
 	microbenchs/prefetch_data_on_node 	\
 	microbenchs/redundant_buffer		\
 	microbenchs/redundant_buffer		\
 	microbenchs/matrix_as_vector		\
 	microbenchs/matrix_as_vector		\
+	microbenchs/bandwidth			\
 	overlap/gpu_concurrency			\
 	overlap/gpu_concurrency			\
 	parallel_tasks/explicit_combined_worker	\
 	parallel_tasks/explicit_combined_worker	\
 	parallel_tasks/parallel_kernels		\
 	parallel_tasks/parallel_kernels		\
@@ -431,6 +434,8 @@ examplebin_SCRIPTS = \
 	microbenchs/tasks_size_overhead.sh
 	microbenchs/tasks_size_overhead.sh
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
 if !STARPU_USE_MPI_MASTER_SLAVE
 if !STARPU_USE_MPI_MASTER_SLAVE
+examplebin_PROGRAMS += \
+	microbenchs/bandwidth
 SHELL_TESTS += \
 SHELL_TESTS += \
 	microbenchs/tasks_data_overhead.sh \
 	microbenchs/tasks_data_overhead.sh \
 	microbenchs/sync_tasks_data_overhead.sh \
 	microbenchs/sync_tasks_data_overhead.sh \
@@ -454,6 +459,7 @@ endif
 if !STARPU_USE_MPI_MASTER_SLAVE
 if !STARPU_USE_MPI_MASTER_SLAVE
 SHELL_TESTS += \
 SHELL_TESTS += \
 	datawizard/locality.sh \
 	datawizard/locality.sh \
+	microbenchs/bandwidth_scheds.sh \
 	overlap/overlap.sh
 	overlap/overlap.sh
 endif
 endif
 
 
@@ -505,6 +511,17 @@ datawizard_acquire_release2_SOURCES +=		\
 	datawizard/acquire_release_opencl.c
 	datawizard/acquire_release_opencl.c
 endif
 endif
 
 
+datawizard_acquire_release_to_SOURCES =		\
+	datawizard/acquire_release_to.c
+if STARPU_USE_CUDA
+datawizard_acquire_release_to_SOURCES +=		\
+	datawizard/acquire_release_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_acquire_release_to_SOURCES +=		\
+	datawizard/acquire_release_opencl.c
+endif
+
 datawizard_scratch_SOURCES =			\
 datawizard_scratch_SOURCES =			\
 	datawizard/scratch.c
 	datawizard/scratch.c
 if STARPU_USE_CUDA
 if STARPU_USE_CUDA

+ 214 - 0
tests/datawizard/acquire_release_to.c

@@ -0,0 +1,214 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+/*
+ * Check that _release_to correctly interacts with tasks working on the same data
+ */
+
+#ifdef STARPU_QUICK_CHECK
+static unsigned ntasks = 10;
+#elif !defined(STARPU_LONG_CHECK)
+static unsigned ntasks = 1000;
+#else
+static unsigned ntasks = 10000;
+#endif
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void increment_opencl(void *buffers[], void *args);
+#endif
+
+void increment_cpu(void *descr[], void *arg)
+{
+	(void)arg;
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.modes = { STARPU_RW },
+	.cpu_funcs = {increment_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.cpu_funcs_name = {"increment_cpu"},
+	.nbuffers = 1
+};
+
+void check_cpu(void *descr[], void *arg)
+{
+	unsigned *val = arg;
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	STARPU_ASSERT(*tokenptr == *val);
+}
+
+static struct starpu_codelet check_cl =
+{
+	.modes = { STARPU_R },
+	.cpu_funcs = {check_cpu},
+	.cpu_funcs_name = {"check_cpu"},
+	.nbuffers = 1
+};
+
+unsigned token = 0;
+starpu_data_handle_t token_handle;
+
+static
+int increment_token(void)
+{
+	int ret;
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	ret = starpu_task_submit(task);
+	return ret;
+}
+
+static
+int check_token(unsigned value)
+{
+	unsigned *value_p;
+	int ret;
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &check_cl;
+	task->handles[0] = token_handle;
+	task->cl_arg = value_p = malloc(sizeof(*value_p));
+	task->cl_arg_size = sizeof(*value_p);
+	task->cl_arg_free = 1;
+	*value_p = value;
+	ret = starpu_task_submit(task);
+	return ret;
+}
+
+static
+void callback(void *arg)
+{
+	(void)arg;
+	token++;
+	starpu_data_release_to(token_handle, STARPU_W);
+	starpu_sleep(0.001);
+	starpu_data_release_to(token_handle, STARPU_R);
+	starpu_sleep(0.001);
+	starpu_data_release(token_handle);
+}
+
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program;
+#endif
+int main(int argc, char **argv)
+{
+	unsigned i;
+	int ret;
+
+        ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/acquire_release_opencl_kernel.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+	starpu_variable_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, sizeof(unsigned));
+
+        FPRINTF(stderr, "Token: %u\n", token);
+
+	for(i=0; i<ntasks; i++)
+	{
+		/* synchronize data in RAM */
+                ret = starpu_data_acquire(token_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+                token ++;
+
+		ret = check_token(4*i+1);
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = increment_token();
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = check_token(4*i+2);
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		starpu_sleep(0.001);
+		starpu_data_release_to(token_handle, STARPU_W);
+
+		starpu_sleep(0.001);
+		starpu_data_release_to(token_handle, STARPU_R);
+
+		starpu_sleep(0.001);
+		starpu_data_release(token_handle);
+
+		ret = starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
+
+		ret = check_token(4*i+3);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = increment_token();
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = check_token(4*i+4);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	}
+
+	starpu_data_unregister(token_handle);
+
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+
+        FPRINTF(stderr, "Token: %u\n", token);
+	if (token == ntasks * 4)
+		ret = EXIT_SUCCESS;
+	else
+		ret = EXIT_FAILURE;
+	return ret;
+
+enodev_release:
+	starpu_data_release(token_handle);
+enodev:
+	starpu_data_unregister(token_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 3 - 3
tests/datawizard/bcsr.c

@@ -39,20 +39,20 @@ void cpu_show_bcsr(void *descr[], void *arg)
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 
-	printf("\nnnz %d elemsize %d\n", nnz, elemsize);
+	printf("\nnnz %u elemsize %u\n", nnz, elemsize);
 
 
 	for (i = 0; i < nrow; i++)
 	for (i = 0; i < nrow; i++)
 	{
 	{
 		uint32_t row_start = rowptr[i] - firstentry;
 		uint32_t row_start = rowptr[i] - firstentry;
 		uint32_t row_end = rowptr[i+1] - firstentry;
 		uint32_t row_end = rowptr[i+1] - firstentry;
 
 
-		printf("row %d\n", i);
+		printf("row %u\n", i);
 
 
 		for (j = row_start; j < row_end; j++)
 		for (j = row_start; j < row_end; j++)
 		{
 		{
 			int *block = nzval + j * r*c;
 			int *block = nzval + j * r*c;
 
 
-			printf( " column %d\n", colind[j]);
+			printf( " column %u\n", colind[j]);
 
 
 			for (y = 0; y < r; y++)
 			for (y = 0; y < r; y++)
 			{
 			{

+ 3 - 3
tests/datawizard/interfaces/block/block_opencl.c

@@ -83,12 +83,12 @@ test_block_opencl_func(void *buffers[], void *args)
 	}
 	}
 			
 			
 	{
 	{
-		size_t global = nx * ny * nz;
+		size_t global[3] = {nx, ny, nz};
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
 					     kernel,
-					     1,
+					     3,
 					     NULL,
 					     NULL,
-					     &global,
+					     global,
 					     NULL,
 					     NULL,
 					     0,
 					     0,
 					     NULL,
 					     NULL,

+ 15 - 23
tests/datawizard/interfaces/block/block_opencl_kernel.cl

@@ -18,29 +18,21 @@ __kernel void block_opencl(__global int *block,
 			   int ldy, int ldz,
 			   int ldy, int ldz,
 			   int factor, __global int *err)
 			   int factor, __global int *err)
 {
 {
-        const int id = get_global_id(0);
-	if (id > 0)
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2);
+	if (idx >= nx)
 		return;
 		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+
+	int val = idz*ny*nx+idy*nx+idx;
+	int i = (idz*ldz)+(idy*ldy)+idx;
 
 
-	unsigned int i, j, k;
-	int val = 0;
-	for (k = 0; k < nz; k++)
-	{
-		for (j = 0; j < ny; j++)
-		{
-			for (i = 0; i < nx; i++)
-			{
-                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
-				{
-					*err = 1;
-					return;
-				}
-				else
-				{
-					block[(k*ldz)+(j*ldy)+i] *= -1;
-					val++;
-				}
-			}
-		}
-	}
+	if (block[i] != factor * val)
+		*err = 1;
+	else
+		block[i] *= -1;
 }
 }

+ 1 - 1
tests/datawizard/interfaces/tensor/tensor_interface.c

@@ -18,7 +18,7 @@
 #include "../test_interfaces.h"
 #include "../test_interfaces.h"
 #include "../../../helper.h"
 #include "../../../helper.h"
 
 
-#define NX 16
+#define NX 4
 #define NY NX
 #define NY NX
 #define NZ NX
 #define NZ NX
 #define NT NX
 #define NT NX

+ 3 - 3
tests/datawizard/interfaces/tensor/tensor_opencl.c

@@ -87,12 +87,12 @@ test_tensor_opencl_func(void *buffers[], void *args)
 	}
 	}
 			
 			
 	{
 	{
-                size_t global = 1;
+		size_t global[3] = {nx, ny, nz*nt};
 		err = clEnqueueNDRangeKernel(queue,
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
 					     kernel,
-					     1,
+					     3,
 					     NULL,
 					     NULL,
-					     &global,
+					     global,
 					     NULL,
 					     NULL,
 					     0,
 					     0,
 					     NULL,
 					     NULL,

+ 18 - 26
tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl

@@ -18,32 +18,24 @@ __kernel void tensor_opencl(__global int *tensor,
 			   int ldy, int ldz, int ldt,
 			   int ldy, int ldz, int ldt,
 			   int factor, __global int *err)
 			   int factor, __global int *err)
 {
 {
-        const int id = get_global_id(0);
-	if (id > 0)
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2) % nz;
+	const int idt = get_global_id(2) / nz;
+	if (idx >= nx)
 		return;
 		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+	if (idt >= nt)
+		return;
+
+	int val = idt*nz*ny*nx+idz*ny*nx+idy*nx+idx;
+	int i = (idt*ldt)+(idz*ldz)+(idy*ldy)+idx;
 
 
-	unsigned int i, j, k, l;
-	int val = 0;
-	for (l = 0; l < nt; l++)
-	{
-	    for (k = 0; k < nz; k++)
-	    {
-		for (j = 0; j < ny; j++)
-		{
-			for (i = 0; i < nx; i++)
-			{
-                                if (tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] != factor * val)
-				{
-					*err = 1;
-					return;
-				}
-				else
-				{
-					tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] *= -1;
-					val++;
-				}
-			}
-		}
-	    }
-	}
+	if (tensor[i] != factor * val)
+		*err = 1;
+	else
+		tensor[i] *= -1;
 }
 }

+ 8 - 0
tests/energy/energy_efficiency.c

@@ -320,11 +320,19 @@ int main(int argc, char *argv[])
 
 
 	unsigned N, k, m, n, iter, NITER;
 	unsigned N, k, m, n, iter, NITER;
 	if (argc < 2)
 	if (argc < 2)
+#ifdef STARPU_QUICK_CHECK
+		N = 10;
+#else
 		N = 40;
 		N = 40;
+#endif
 	else
 	else
 		N = atoi(argv[1]);
 		N = atoi(argv[1]);
 	if (argc < 3)
 	if (argc < 3)
+#ifdef STARPU_QUICK_CHECK
+		NITER = 3;
+#else
 		NITER = 10;
 		NITER = 10;
+#endif
 	else
 	else
 		NITER = atoi(argv[2]);
 		NITER = atoi(argv[2]);
 	if (N == 0)
 	if (N == 0)

+ 4 - 4
tests/helper/starpu_data_dup_ro.c

@@ -84,7 +84,7 @@ int main(int argc, char **argv)
 	ret = EXIT_SUCCESS;
 	ret = EXIT_SUCCESS;
 	if (*var != 42)
 	if (*var != 42)
 	{
 	{
-	     FPRINTF(stderr, "var2 is %d but it should be %d\n", *var, 42);
+	     FPRINTF(stderr, "var2 is %u but it should be %d\n", *var, 42);
 	     ret = EXIT_FAILURE;
 	     ret = EXIT_FAILURE;
 	}
 	}
 	starpu_data_release(var2_handle);
 	starpu_data_release(var2_handle);
@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 	var = starpu_data_get_local_ptr(var3_handle);
 	var = starpu_data_get_local_ptr(var3_handle);
 	if (*var != 42)
 	if (*var != 42)
 	{
 	{
-	     FPRINTF(stderr, "var3 is %d but it should be %d\n", *var, 42);
+	     FPRINTF(stderr, "var3 is %u but it should be %d\n", *var, 42);
 	     ret = EXIT_FAILURE;
 	     ret = EXIT_FAILURE;
 	}
 	}
 	starpu_data_release(var3_handle);
 	starpu_data_release(var3_handle);
@@ -102,7 +102,7 @@ int main(int argc, char **argv)
 	var = starpu_data_get_local_ptr(var4_handle);
 	var = starpu_data_get_local_ptr(var4_handle);
 	if (*var != 42)
 	if (*var != 42)
 	{
 	{
-	     FPRINTF(stderr, "var4 is %d but it should be %d\n", *var, 42);
+	     FPRINTF(stderr, "var4 is %u but it should be %d\n", *var, 42);
 	     ret = EXIT_FAILURE;
 	     ret = EXIT_FAILURE;
 	}
 	}
 	starpu_data_release(var4_handle);
 	starpu_data_release(var4_handle);
@@ -111,7 +111,7 @@ int main(int argc, char **argv)
 	var = starpu_data_get_local_ptr(var5_handle);
 	var = starpu_data_get_local_ptr(var5_handle);
 	if (*var != 43)
 	if (*var != 43)
 	{
 	{
-	     FPRINTF(stderr, "var5 is %d but it should be %d\n", *var, 43);
+	     FPRINTF(stderr, "var5 is %u but it should be %d\n", *var, 43);
 	     ret = EXIT_FAILURE;
 	     ret = EXIT_FAILURE;
 	}
 	}
 	starpu_data_release(var5_handle);
 	starpu_data_release(var5_handle);

+ 343 - 0
tests/microbenchs/bandwidth.c

@@ -0,0 +1,343 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+
+#include <starpu.h>
+#include "../helper.h"
+
+/*
+ * Measure the memory bandwidth available to kernels depending on the number of
+ * kernels and number of idle workers.
+ */
+
+#ifdef STARPU_QUICK_CHECK
+static size_t size = 1024;
+static unsigned cpustep = 4;
+#else
+/* Must be bigger than available cache size per core, 64MiB should be enough */
+static size_t size = 64UL << 20;
+static unsigned cpustep = 1;
+#endif
+
+static unsigned noalone = 0;
+static unsigned iter = 30;
+static unsigned total_ncpus;
+static starpu_pthread_barrier_t barrier_begin, barrier_end;
+static float *result;
+static void **buffers;	/* Indexed by logical core number */
+static char padding1[STARPU_CACHELINE_SIZE];
+static volatile char finished;
+static char padding2[STARPU_CACHELINE_SIZE];
+
+static unsigned interleave(unsigned i);
+
+/* Initialize the buffer locally */
+void initialize_buffer(void *foo)
+{
+	unsigned id = starpu_worker_get_id();
+#ifdef STARPU_HAVE_POSIX_MEMALIGN
+	int ret = posix_memalign(&buffers[id], getpagesize(), 2*size);
+	STARPU_ASSERT(ret == 0);
+#else
+	buffers[id] = malloc(2*size);
+#endif
+	memset(buffers[id], 0, 2*size);
+}
+
+/* Actual transfer codelet */
+void bw_func(void *descr[], void *arg)
+{
+	int id = (uintptr_t) arg;
+	void *src = buffers[id];
+	void *dst = (void*) ((uintptr_t)src + size);
+	unsigned i;
+	double start, stop;
+
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	start = starpu_timing_now();
+	for (i = 0; i < iter; i++)
+	{
+		memcpy(dst, src, size);
+		STARPU_SYNCHRONIZE();
+	}
+	stop = starpu_timing_now();
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_end);
+	finished = 1;
+
+	result[id] = (size*iter) / (stop - start);
+}
+
+static struct starpu_codelet bw_codelet =
+{
+	.cpu_funcs = {bw_func},
+	.model = NULL,
+	.nbuffers = 0,
+};
+
+/* Codelet that waits for completion while doing lots of cpu yields (nop). */
+void nop_func(void *descr[], void *arg)
+{
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	while (!finished)
+	{
+		unsigned i;
+		for (i = 0; i < 1000000; i++)
+			STARPU_UYIELD();
+		STARPU_SYNCHRONIZE();
+	}
+}
+
+static struct starpu_codelet nop_codelet =
+{
+	.cpu_funcs = {nop_func},
+	.model = NULL,
+	.nbuffers = 0,
+};
+
+/* Codelet that waits for completion while aggressively reading the finished variable. */
+void sync_func(void *descr[], void *arg)
+{
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	while (!finished)
+	{
+		STARPU_VALGRIND_YIELD();
+		STARPU_SYNCHRONIZE();
+	}
+}
+
+static struct starpu_codelet sync_codelet =
+{
+	.cpu_funcs = {sync_func},
+	.model = NULL,
+	.nbuffers = 0,
+};
+
+static void usage(char **argv)
+{
+	fprintf(stderr, "Usage: %s [-n niter] [-s size (MB)] [-c cpustep] [-a]\n", argv[0]);
+	fprintf(stderr, "\t-n niter\tNumber of iterations\n");
+	fprintf(stderr, "\t-s size\tBuffer size in MB\n");
+	fprintf(stderr, "\t-c cpustep\tCpu number increment\n");
+	fprintf(stderr, "\t-a Do not run the alone test\n");
+	exit(EXIT_FAILURE);
+}
+
+static void parse_args(int argc, char **argv)
+{
+	int c;
+	while ((c = getopt(argc, argv, "n:s:c:ah")) != -1)
+	switch(c)
+	{
+		case 'n':
+			iter = atoi(optarg);
+			break;
+		case 's':
+			size = (long)atoi(optarg) << 20;
+			break;
+		case 'c':
+			cpustep = atoi(optarg);
+			break;
+		case 'a':
+			noalone = 1;
+			break;
+		case 'h':
+			usage(argv);
+			break;
+	}
+}
+
+static unsigned interleave(unsigned i)
+{
+	/* TODO: rather distribute over hierarchy */
+	if (total_ncpus > 1)
+		return (i % (total_ncpus/2))*2 + i / (total_ncpus/2);
+	else
+		return 0;
+}
+
+enum sleep_type {
+	PAUSE,
+	NOP,
+	SYNC,
+	SCHED,
+};
+
+static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int intl, enum sleep_type sleep)
+{
+	int ret;
+	unsigned i;
+	struct starpu_conf conf;
+	float bw;
+
+	starpu_conf_init(&conf);
+	conf.precedence_over_environment_variables = 1;
+	conf.ncuda = 0;
+	conf.nopencl = 0;
+	conf.nmic = 0;
+	conf.nmpi_ms = 0;
+	conf.ncpus = ncpus;
+
+	if (intl && sleep == PAUSE)
+	{
+		conf.use_explicit_workers_bindid = 1;
+		for (i = 0; i < ncpus; i++)
+			conf.workers_bindid[i] = interleave(i);
+	}
+
+	ret = starpu_initialize(&conf, argc, argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	if (sleep == PAUSE || sleep == SCHED)
+		/* In these cases we don't have a task on each cpu */
+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, nbusy);
+	else
+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, ncpus);
+
+	STARPU_PTHREAD_BARRIER_INIT(&barrier_end, NULL, nbusy);
+
+	finished = 0;
+	for (i = 0; i < ncpus; i++)
+		result[i] = NAN;
+
+	for (i = 0; i < nbusy; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &bw_codelet;
+
+		if (intl)
+			task->cl_arg = (void*) (uintptr_t) interleave(i);
+		else
+			task->cl_arg = (void*) (uintptr_t) i;
+
+		task->execute_on_a_specific_worker = 1;
+		if (intl && sleep != PAUSE) /* In the pause case we interleaved above */
+			task->workerid = interleave(i);
+		else
+			task->workerid = i;
+
+		ret = starpu_task_submit(task);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	if (sleep != PAUSE && sleep != SCHED)
+	{
+		/* Add waiting tasks */
+		for ( ; i < ncpus; i++)
+		{
+			struct starpu_task *task = starpu_task_create();
+			switch (sleep)
+			{
+			case NOP:
+				task->cl = &nop_codelet;
+				break;
+			case SYNC:
+				task->cl = &sync_codelet;
+				break;
+			default:
+				STARPU_ASSERT(0);
+			}
+			task->execute_on_a_specific_worker = 1;
+			task->workerid = interleave(i);
+			ret = starpu_task_submit(task);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+	}
+
+
+	starpu_task_wait_for_all();
+	starpu_shutdown();
+
+	for (bw = 0., i = 0; i < nbusy; i++)
+	{
+		if (intl)
+			bw += result[interleave(i)];
+		else
+			bw += result[i];
+	}
+	return bw;
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+	unsigned n;
+	struct starpu_conf conf;
+	float alone, alone_int, alone_int_nop, alone_int_sync, sched, sched_int;
+
+	parse_args(argc, argv);
+
+	starpu_conf_init(&conf);
+	conf.precedence_over_environment_variables = 1;
+	conf.ncuda = 0;
+	conf.nopencl = 0;
+	conf.nmic = 0;
+	conf.nmpi_ms = 0;
+
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	total_ncpus = starpu_cpu_worker_get_count();
+
+	buffers = malloc(total_ncpus * sizeof(*buffers));
+	starpu_execute_on_each_worker_ex(initialize_buffer, NULL, STARPU_CPU, "init_buffer");
+	starpu_shutdown();
+
+	if (total_ncpus == 0)
+		return STARPU_TEST_SKIPPED;
+
+	result = malloc(total_ncpus * sizeof(result[0]));
+
+	printf("# nw\ta comp.\t+sched\teff%%\ta scat.\t+nop\t+sync\t+sched\teff%% vs nop\n");
+	for (n = cpustep; n <= total_ncpus; n += cpustep)
+	{
+		if (noalone)
+		{
+			alone = 0.;
+			alone_int = 0.;
+			alone_int_nop = 0.;
+			alone_int_sync = 0.;
+		}
+		else
+		{
+			alone = bench(&argc, &argv, n, n, 0, PAUSE);
+			alone_int = bench(&argc, &argv, n, n, 1, PAUSE);
+			alone_int_nop = bench(&argc, &argv, n, total_ncpus, 1, NOP);
+			alone_int_sync = bench(&argc, &argv, n, total_ncpus, 1, SYNC);
+		}
+		sched = bench(&argc, &argv, n, total_ncpus, 0, SCHED);
+		sched_int = bench(&argc, &argv, n, total_ncpus, 1, SCHED);
+		printf("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
+				n,
+				alone/1000,
+				sched/1000, sched*100/alone,
+				alone_int/1000,
+				alone_int_nop/1000,
+				alone_int_sync/1000,
+				sched_int/1000, sched_int*100/alone_int_nop);
+		fflush(stdout);
+	}
+
+	free(result);
+
+	for (n = 0; n < total_ncpus; n++)
+		free(buffers[n]);
+	free(buffers);
+
+	return EXIT_SUCCESS;
+}

+ 75 - 0
tests/microbenchs/bandwidth_scheds.sh

@@ -0,0 +1,75 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2016-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+set -e
+
+if [ -n "$STARPU_SCHED" ]
+then
+	SCHEDS=$STARPU_SCHED
+	DEFAULT=$STARPU_SCHED
+else
+	SCHEDS=`$(dirname $0)/../../tools/starpu_sched_display`
+	DEFAULT=eager
+fi
+
+if [ -n "$STARPU_BENCH_DIR" ]; then
+	cat > bandwidth.gp << EOF
+set term svg font ",12" size 1500,500 linewidth 0.5
+set output "bandwidth.svg"
+set pointsize 0.3
+EOF
+else
+	fast="-n 3 -c 4"
+	cat > bandwidth.gp << EOF
+set term postscript eps enhanced color font ",18"
+set output "bandwidth.eps"
+set size 2,1
+EOF
+fi
+
+cat >> bandwidth.gp << EOF
+set key outside
+set ylabel "GB/s"
+set xlabel "ncores"
+
+plot \\
+	"bandwidth-$DEFAULT.dat" using 1:5 with lines title "alone interleave", \\
+	"bandwidth-$DEFAULT.dat" using 1:6 with lines title "nop", \\
+	"bandwidth-$DEFAULT.dat" using 1:7 with lines title "sync", \\
+	"bandwidth-$DEFAULT.dat" using 1:2 with lines title "alone contiguous", \\
+EOF
+
+type=1
+for sched in $SCHEDS
+do
+	if [ "$sched" != eager -a "$sched" != "$SCHEDS" ]; then
+		extra=-a
+	else
+		extra=
+	fi
+
+	STARPU_BACKOFF_MIN=0 STARPU_BACKOFF_MAX=0 STARPU_SCHED=$sched $STARPU_LAUNCH $(dirname $0)/bandwidth $fast $extra "$@" | tee bandwidth-$sched.dat
+	echo "\"bandwidth-$sched.dat\" using 1:3 with linespoints lt $type pt $type title \"$sched\", \\" >> bandwidth.gp
+	echo "\"bandwidth-$sched.dat\" using 1:8 with linespoints lt $type pt $type notitle, \\" >> bandwidth.gp
+	type=$((type+1))
+done
+
+if gnuplot bandwidth.gp ; then
+	if [ -n "$STARPU_BENCH_DIR" ]; then
+		cp bandwidth.svg $STARPU_BENCH_DIR/
+	fi
+fi

+ 1 - 0
tests/overlap/overlap.c

@@ -135,6 +135,7 @@ int main(int argc, char **argv)
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 	}
 
 
+	starpu_do_schedule();
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (!finished)
 	if (!finished)
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);

+ 1 - 1
tests/parallel_tasks/explicit_combined_worker.c

@@ -114,7 +114,7 @@ int main(void)
 enodev:
 enodev:
 	starpu_data_unregister(v_handle);
 	starpu_data_unregister(v_handle);
 	starpu_free(v);
 	starpu_free(v);
-	fprintf(stderr, "WARNING: No one can execute the task on workerid %d\n", worker);
+	fprintf(stderr, "WARNING: No one can execute the task on workerid %u\n", worker);
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
 	starpu_shutdown();
 	starpu_shutdown();

+ 1 - 0
tools/Makefile.am

@@ -285,6 +285,7 @@ EXTRA_DIST =				\
 	dev/cppcheck/suppressions.txt	\
 	dev/cppcheck/suppressions.txt	\
 	dev/valgrind/bash.suppr		\
 	dev/valgrind/bash.suppr		\
 	dev/valgrind/fxt.suppr		\
 	dev/valgrind/fxt.suppr		\
+	dev/valgrind/glpk.suppr		\
 	dev/valgrind/hdf5.suppr		\
 	dev/valgrind/hdf5.suppr		\
 	dev/valgrind/hwloc.suppr	\
 	dev/valgrind/hwloc.suppr	\
 	dev/valgrind/libc.suppr		\
 	dev/valgrind/libc.suppr		\

+ 23 - 0
tools/dev/valgrind/glpk.suppr

@@ -0,0 +1,23 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   ...
+   fun:glp_init_env
+}

+ 1 - 2
tools/dev/valgrind/libc.suppr

@@ -263,8 +263,7 @@
    Memcheck:Leak
    Memcheck:Leak
    match-leak-kinds: reachable
    match-leak-kinds: reachable
    fun:malloc
    fun:malloc
-   fun:_dl_close_worker
-   fun:_dl_close_worker
+   ...
    fun:_dl_close
    fun:_dl_close
    fun:_dl_catch_exception
    fun:_dl_catch_exception
    fun:_dl_catch_error
    fun:_dl_catch_error

+ 1 - 1
tools/dev/valgrind/padico.suppr

@@ -110,7 +110,7 @@
    Memcheck:Leak
    Memcheck:Leak
    match-leak-kinds: reachable
    match-leak-kinds: reachable
    fun:malloc
    fun:malloc
-   fun:_dl_close_worker
+   ...
    fun:_dl_close
    fun:_dl_close
    fun:_dl_catch_error
    fun:_dl_catch_error
    fun:dlerror_run
    fun:dlerror_run

+ 95 - 65
tools/gdbinit

@@ -39,33 +39,6 @@ define starpu-print-task
   set $task = (struct starpu_task *)$arg0
   set $task = (struct starpu_task *)$arg0
   set $job = (struct _starpu_job *)$task->starpu_private
   set $job = (struct _starpu_job *)$task->starpu_private
   set $status=0
   set $status=0
-  if $task->status == 0
-    set $status="STARPU_TASK_INIT"
-  end
-  if $task->status == 1
-    set $status="STARPU_TASK_BLOCKED"
-  end
-  if $task->status == 2
-    set $status="STARPU_TASK_READY"
-  end
-  if $task->status == 3
-    set $status="STARPU_TASK_RUNNING"
-  end
-  if $task->status == 4
-    set $status="STARPU_TASK_FINISHED"
-  end
-  if $task->status == 5
-    set $status="STARPU_TASK_BLOCKED_ON_TAG"
-  end
-  if $task->status == 6
-    set $status="STARPU_TASK_BLOCKED_ON_TASK"
-  end
-  if $task->status == 7
-    set $status="STARPU_TASK_BLOCKED_ON_DATA"
-  end
-  if $task->status == 8
-    set $status="STARPU_TASK_STOPPED"
-  end
 
 
   printf "StarPU Task (%p)\n", $task
   printf "StarPU Task (%p)\n", $task
   if $task->name
   if $task->name
@@ -81,13 +54,42 @@ define starpu-print-task
   end
   end
   printf "\tnbuffers:\t\t\t<%d>\n", $nbuffers
   printf "\tnbuffers:\t\t\t<%d>\n", $nbuffers
   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
+  printf "\tcl_arg:\t\t\t<%p>\n", $task->cl_arg
   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
   printf "\texecute_on_a_specific_worker:\t<%d>\n", $task->execute_on_a_specific_worker
   printf "\texecute_on_a_specific_worker:\t<%d>\n", $task->execute_on_a_specific_worker
   printf "\tworkerid:\t\t\t<%d>\n", $task->workerid
   printf "\tworkerid:\t\t\t<%d>\n", $task->workerid
   printf "\tdetach:\t\t\t\t<%d>\n", $task->detach
   printf "\tdetach:\t\t\t\t<%d>\n", $task->detach
   printf "\tdestroy:\t\t\t<%d>\n", $task->destroy
   printf "\tdestroy:\t\t\t<%d>\n", $task->destroy
   printf "\tregenerate:\t\t\t<%d>\n", $task->regenerate
   printf "\tregenerate:\t\t\t<%d>\n", $task->regenerate
-  printf "\tstatus:\t\t\t\t<%s>\n", $status
+  printf "\tstatus:\t\t\t\t"
+  if $task->status == 0
+    printf "STARPU_TASK_INIT"
+  end
+  if $task->status == 1
+    printf "STARPU_TASK_BLOCKED"
+  end
+  if $task->status == 2
+    printf "STARPU_TASK_READY"
+  end
+  if $task->status == 3
+    printf "STARPU_TASK_RUNNING"
+  end
+  if $task->status == 4
+    printf "STARPU_TASK_FINISHED"
+  end
+  if $task->status == 5
+    printf "STARPU_TASK_BLOCKED_ON_TAG"
+  end
+  if $task->status == 6
+    printf "STARPU_TASK_BLOCKED_ON_TASK"
+  end
+  if $task->status == 7
+    printf "STARPU_TASK_BLOCKED_ON_DATA"
+  end
+  if $task->status == 8
+    printf "STARPU_TASK_STOPPED"
+  end
+  printf "\n"
   printf "\tjob:\t\t\t\t<%p>\n", $job
   printf "\tjob:\t\t\t\t<%p>\n", $job
   printf "\ttag_id:\t\t\t\t<%d>\n", $task->tag_id
   printf "\ttag_id:\t\t\t\t<%d>\n", $task->tag_id
   printf "\tndeps:\t\t\t\t<%u>\n", $job->job_successors->ndeps
   printf "\tndeps:\t\t\t\t<%u>\n", $job->job_successors->ndeps
@@ -169,35 +171,36 @@ define starpu-workers
   printf "[Id] Name                                     Arch Mask Devid Bindid Workerid Isrunning Isinitialized Status\n"
   printf "[Id] Name                                     Arch Mask Devid Bindid Workerid Isrunning Isinitialized Status\n"
   while $num<_starpu_config->topology->nworkers
   while $num<_starpu_config->topology->nworkers
     set $worker=&_starpu_config->workers[$num]
     set $worker=&_starpu_config->workers[$num]
+    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d ", $num, $worker->name, $worker->arch, $worker->worker_mask, \
+          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized
     if $worker->status == STATUS_INVALID
     if $worker->status == STATUS_INVALID
-      set $status="INVALID"
+      printf "INVALID"
     end
     end
     if $worker->status == STATUS_UNKNOWN
     if $worker->status == STATUS_UNKNOWN
-      set $status="UNKNOWN"
+      printf "UNKNOWN"
     end
     end
     if $worker->status == STATUS_INITIALIZING
     if $worker->status == STATUS_INITIALIZING
-      set $status="INITIALIZING"
+      printf "INITIALIZING"
     end
     end
     if $worker->status == STATUS_EXECUTING
     if $worker->status == STATUS_EXECUTING
-      set $status="EXECUTING"
+      printf "EXECUTING"
     end
     end
     if $worker->status == STATUS_CALLBACK
     if $worker->status == STATUS_CALLBACK
-      set $status="CALLBACK"
+      printf "CALLBACK"
     end
     end
     if $worker->status == STATUS_SCHEDULING
     if $worker->status == STATUS_SCHEDULING
-      set $status="SCHEDULING"
+      printf "SCHEDULING"
     end
     end
     if $worker->status == STATUS_WAITING
     if $worker->status == STATUS_WAITING
-      set $status="WAITING"
+      printf "WAITING"
     end
     end
     if $worker->status == STATUS_SLEEPING_SCHEDULING
     if $worker->status == STATUS_SLEEPING_SCHEDULING
-      set $status="SLEEPING_SCHEDULING"
+      printf "SLEEPING_SCHEDULING"
     end
     end
     if $worker->status == STATUS_SLEEPING
     if $worker->status == STATUS_SLEEPING
-      set $status="SLEEPING"
+      printf "SLEEPING"
     end
     end
-    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d %s\n", $num, $worker->name, $worker->arch, $worker->worker_mask, \
-          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized, $status
+    printf "\n"
     set $num = $num + 1
     set $num = $num + 1
   end
   end
 end
 end
@@ -205,23 +208,24 @@ end
 define starpu-print-tag
 define starpu-print-tag
   set language c
   set language c
   set $tag_struct = (struct _starpu_tag *)_gettag_struct($arg0)
   set $tag_struct = (struct _starpu_tag *)_gettag_struct($arg0)
+  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
+  printf "\tstate "
   if $tag_struct->state == STARPU_INVALID_STATE
   if $tag_struct->state == STARPU_INVALID_STATE
-     set $status="STARPU_INVALID_STATE"
+     printf "STARPU_INVALID_STATE"
   end
   end
   if $tag_struct->state == STARPU_ASSOCIATED
   if $tag_struct->state == STARPU_ASSOCIATED
-     set $status="STARPU_ASSOCIATED"
+     printf "STARPU_ASSOCIATED"
   end
   end
   if $tag_struct->state == STARPU_BLOCKED
   if $tag_struct->state == STARPU_BLOCKED
-     set $status="STARPU_BLOCKED"
+     printf "STARPU_BLOCKED"
   end
   end
   if $tag_struct->state == STARPU_READY
   if $tag_struct->state == STARPU_READY
-     set $status="STARPU_READY"
+     printf "STARPU_READY"
   end
   end
   if $tag_struct->state == STARPU_DONE
   if $tag_struct->state == STARPU_DONE
-     set $status="STARPU_DONE"
+     printf "STARPU_DONE"
   end
   end
-  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
-  printf "\tstate %s\n", $status
+  printf "\n"
   printf "\tdeps %d\n", $tag_struct->tag_successors.ndeps
   printf "\tdeps %d\n", $tag_struct->tag_successors.ndeps
   printf "\tcompleted %d\n", $tag_struct->tag_successors.ndeps_completed
   printf "\tcompleted %d\n", $tag_struct->tag_successors.ndeps_completed
   printf "\tndeps_remaining:\t\t<%u>\n", $tag_struct->tag_successors->ndeps - $tag_struct->tag_successors->ndeps_completed
   printf "\tndeps_remaining:\t\t<%u>\n", $tag_struct->tag_successors->ndeps - $tag_struct->tag_successors->ndeps_completed
@@ -317,7 +321,11 @@ define starpu-all-tasks
     while $l != &all_jobs_list
     while $l != &all_jobs_list
       set $j = (struct _starpu_job*) (((unsigned long) $l) - ((unsigned long) &((struct _starpu_job *)0)->all_submitted))
       set $j = (struct _starpu_job*) (((unsigned long) $l) - ((unsigned long) &((struct _starpu_job *)0)->all_submitted))
       set $task = $j->task
       set $task = $j->task
-      printf "task %p %s\n", $task, $task->name ? $task->name : ""
+      if $task->name
+        printf "task %p %s\n", $task, $task->name
+      else
+        printf "task %p\n", $task
+      end
       set $l = $l->next
       set $l = $l->next
     end
     end
   end
   end
@@ -915,9 +923,9 @@ end
 define starpu-sched-print-component
 define starpu-sched-print-component
     set $c = (struct starpu_sched_component *) $arg1
     set $c = (struct starpu_sched_component *) $arg1
     starpu-print-spaces $arg0
     starpu-print-spaces $arg0
-    printf "%s %s %s (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? "homogeneous":"heterogeneous", $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? "single-node" : "multi-node", $c
+    printf "%s %c %c (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? 'o':'e', $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? 's' : 'm', $c
     if $c->push_task == fifo_push_task
     if $c->push_task == fifo_push_task
-      set $f = ((struct _starpu_fifo_data *) $c->data)->fifo
+      set $f = &((struct _starpu_fifo_data *) $c->data)->fifo
       starpu-print-spaces $arg0
       starpu-print-spaces $arg0
       printf "%d tasks start %f len %f end %f processed %d\n", $f->ntasks, $f->exp_start, $f->exp_len, $f->exp_end, $f->nprocessed
       printf "%d tasks start %f len %f end %f processed %d\n", $f->ntasks, $f->exp_start, $f->exp_len, $f->exp_end, $f->nprocessed
     end
     end
@@ -951,29 +959,29 @@ end
 
 
 define starpu-mpi-print-request
 define starpu-mpi-print-request
     set $request = (struct _starpu_mpi_req *)$arg0
     set $request = (struct _starpu_mpi_req *)$arg0
-    set $request_type = "unknown_type"
+    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type ", $request, $request->data_handle, $request->data_handle && $request->data_handle->mpi_data ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.node.rank,
     if $request->request_type == SEND_REQ
     if $request->request_type == SEND_REQ
-       set $request_type = "SEND_REQ"
+       printf "SEND_REQ"
     end
     end
     if $request->request_type == RECV_REQ
     if $request->request_type == RECV_REQ
-       set $request_type = "RECV_REQ"
+       printf "RECV_REQ"
     end
     end
     if $request->request_type == WAIT_REQ
     if $request->request_type == WAIT_REQ
-       set $request_type = "WAIT_REQ"
+       printf "WAIT_REQ"
     end
     end
     if $request->request_type == TEST_REQ
     if $request->request_type == TEST_REQ
-       set $request_type = "TEST_REQ"
+       printf "TEST_REQ"
     end
     end
     if $request->request_type == BARRIER_REQ
     if $request->request_type == BARRIER_REQ
-       set $request_type = "BARRIER_REQ"
+       printf "BARRIER_REQ"
     end
     end
     if $request->request_type == PROBE_REQ
     if $request->request_type == PROBE_REQ
-       set $request_type = "PROBE_REQ"
+       printf "PROBE_REQ"
     end
     end
     if $request->request_type == UNKNOWN_REQ
     if $request->request_type == UNKNOWN_REQ
-       set $request_type = "UNKNOWN_REQ"
+       printf "UNKNOWN_REQ"
     end
     end
-    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type %s submitted %d completed %d posted %d detached %d is_internal_req %d\n", $request, $request->data_handle, $request->data_handle ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.rank, $request_type, $request->submitted, $request->completed, $request->posted, $request->detached, $request->is_internal_req
+    printf " submitted %d completed %d posted %d detached %d\n", $request->submitted, $request->completed, $request->posted, $request->detached
 end
 end
 
 
 define starpu-mpi-print-ready-recv-requests
 define starpu-mpi-print-ready-recv-requests
@@ -989,17 +997,39 @@ define starpu-mpi-print-ready-recv-requests
     end
     end
 end
 end
 
 
+define starpu-mpi-print-requests-list
+  set $list = $arg0
+  set $request = $list->_head
+  while $request
+    starpu-mpi-print-request $request
+    set $request = $request->_next
+  end
+end
+
+define starpu-mpi-print-requests-tree
+  if $arg0
+    starpu-mpi-print-requests-tree $arg0->children[0]
+    set $stage = (struct _starpu_mpi_req_prio_list_stage *) $arg0
+    starpu-mpi-print-requests-list (&($stage->list))
+    starpu-mpi-print-requests-tree $arg0->children[1]
+  end
+end
+
 define starpu-mpi-print-ready-send-requests
 define starpu-mpi-print-ready-send-requests
-    set $list = (struct _starpu_mpi_req_prio_list) ready_send_requests
-    if $list
-	set $request = $list.list._head
-        while $request
-            starpu-mpi-print-request $request
-	    set $request = $request->_next
-	end
+  set $prio_list = (struct _starpu_mpi_req_prio_list) ready_send_requests
+  if _starpu_debug
+    if $prio_list
+        starpu-mpi-print-requests-list &$prio_list.list
+    else
+	printf "No ready send requests\n"
+    end
+  else
+    if $prio_list.empty == 0
+        starpu-mpi-print-requests-tree $prio_list.tree.root
     else
     else
 	printf "No ready send requests\n"
 	printf "No ready send requests\n"
     end
     end
+  end
 end
 end
 
 
 define starpu-mpi-print-detached-requests
 define starpu-mpi-print-detached-requests

+ 2 - 2
tools/starpu_fxt_tool.c

@@ -81,7 +81,7 @@ static int parse_args(int argc, char **argv)
 		{
 		{
 			if (options.ninputfiles >= STARPU_FXT_MAX_FILES)
 			if (options.ninputfiles >= STARPU_FXT_MAX_FILES)
 			{
 			{
-				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%u)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
+				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%d)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
 				return 7;
 				return 7;
 			}
 			}
 			options.filenames[options.ninputfiles++] = argv[++i];
 			options.filenames[options.ninputfiles++] = argv[++i];
@@ -179,7 +179,7 @@ static int parse_args(int argc, char **argv)
 		{
 		{
 			if (options.ninputfiles >= STARPU_FXT_MAX_FILES)
 			if (options.ninputfiles >= STARPU_FXT_MAX_FILES)
 			{
 			{
-				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%u)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
+				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%d)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
 				return 7;
 				return 7;
 			}
 			}
 			options.filenames[options.ninputfiles++] = argv[i];
 			options.filenames[options.ninputfiles++] = argv[i];

+ 0 - 0
tools/starpu_perfmodel_recdump.c


Some files were not shown because too many files changed in this diff