瀏覽代碼

Merge branch 'master' into fpga

Nathalie Furmento 5 年之前
父節點
當前提交
536fe80f62
共有 100 個文件被更改,包括 1733 次插入795 次删除
  1. 2 0
      ChangeLog
  2. 22 12
      configure.ac
  3. 14 1
      doc/doxygen/chapters/210_check_list_performance.doxy
  4. 5 1
      doc/doxygen/chapters/370_online_performance_tools.doxy
  5. 13 2
      doc/doxygen/chapters/470_simgrid.doxy
  6. 8 0
      doc/doxygen/chapters/501_environment_variables.doxy
  7. 12 4
      doc/doxygen/chapters/510_configure_options.doxy
  8. 1 0
      examples/Makefile.am
  9. 11 0
      examples/api/bcsr_data_interface.c
  10. 10 0
      examples/api/block_data_interface.c
  11. 2 0
      examples/api/coo_data_interface.c
  12. 9 0
      examples/api/csr_data_interface.c
  13. 10 0
      examples/api/matrix_data_interface.c
  14. 2 0
      examples/api/multiformat_data_interface.c
  15. 37 0
      examples/api/tensor_data_interface.c
  16. 5 0
      examples/api/variable_data_interface.c
  17. 8 0
      examples/api/vector_data_interface.c
  18. 2 0
      examples/api/void_data_interface.c
  19. 17 20
      examples/cholesky/cholesky_implicit.c
  20. 5 0
      examples/filters/fblock.c
  21. 2 2
      examples/filters/fblock_opencl.c
  22. 13 10
      examples/filters/fblock_opencl_kernel.cl
  23. 19 1
      include/starpu_data.h
  24. 10 16
      include/starpu_fxt.h
  25. 7 2
      min-dgels/Makefile.in
  26. 0 152
      min-dgels/additional/blaswrap.h
  27. 8 0
      min-dgels/base/F2CLIBS/libf2c/Makefile
  28. 0 152
      min-dgels/base/INCLUDE/blaswrap.h
  29. 5 3
      mpi/examples/Makefile.am
  30. 10 5
      mpi/examples/benchs/abstract_sendrecv_bench.c
  31. 17 12
      mpi/examples/benchs/burst_helper.c
  32. 10 5
      mpi/examples/benchs/gemm_helper.c
  33. 2 1
      mpi/examples/benchs/sendrecv_bench.c
  34. 10 6
      mpi/examples/benchs/sendrecv_parallel_tasks_bench.c
  35. 1 1
      mpi/examples/filters/filter.c
  36. 149 41
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  37. 14 11
      mpi/examples/matrix_decomposition/mpi_decomposition_params.c
  38. 23 3
      mpi/examples/mpi_lu/plu_example.c
  39. 13 0
      mpi/examples/mpi_lu/plu_implicit_example.c
  40. 26 11
      mpi/examples/mpi_lu/plu_outofcore_example.c
  41. 18 14
      mpi/src/mpi/starpu_mpi_mpi.c
  42. 7 4
      mpi/src/nmad/starpu_mpi_nmad.c
  43. 1 1
      mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c
  44. 2 2
      mpi/src/starpu_mpi_coop_sends.c
  45. 1 1
      mpi/src/starpu_mpi_fxt.h
  46. 2 0
      mpi/src/starpu_mpi_private.c
  47. 2 1
      mpi/src/starpu_mpi_private.h
  48. 2 2
      mpi/tests/Makefile.am
  49. 1 1
      mpi/tests/broadcast.c
  50. 18 2
      mpi/tests/early_request.c
  51. 1 1
      mpi/tests/insert_task_compute.c
  52. 5 3
      mpi/tests/pingpong.c
  53. 11 0
      src/common/fxt.c
  54. 2 0
      src/common/utils.h
  55. 4 16
      src/core/dependencies/cg.c
  56. 31 9
      src/core/dependencies/data_arbiter_concurrency.c
  57. 29 11
      src/core/dependencies/data_concurrency.c
  58. 2 2
      src/core/dependencies/data_concurrency.h
  59. 14 4
      src/core/dependencies/implicit_data_deps.c
  60. 1 1
      src/core/dependencies/implicit_data_deps.h
  61. 16 11
      src/core/dependencies/tags.c
  62. 2 0
      src/core/dependencies/tags.h
  63. 1 1
      src/core/jobs.c
  64. 3 0
      src/core/perfmodel/multiple_regression.c
  65. 13 10
      src/datawizard/coherency.c
  66. 1 0
      src/datawizard/coherency.h
  67. 4 4
      src/datawizard/copy_driver.c
  68. 6 2
      src/datawizard/interfaces/data_interface.c
  69. 31 8
      src/datawizard/user_interactions.c
  70. 1 1
      src/datawizard/write_back.c
  71. 2 2
      src/debug/latency.c
  72. 19 13
      src/debug/traces/starpu_fxt.c
  73. 1 0
      src/debug/traces/starpu_fxt.h
  74. 8 6
      src/debug/traces/starpu_fxt_mpi.c
  75. 7 4
      src/debug/traces/starpu_paje.c
  76. 30 25
      src/sched_policies/component_work_stealing.c
  77. 68 22
      src/sched_policies/work_stealing_policy.c
  78. 1 2
      src/util/execute_on_all.c
  79. 1 0
      src/util/starpu_data_cpy.c
  80. 18 1
      tests/Makefile.am
  81. 214 0
      tests/datawizard/acquire_release_to.c
  82. 3 3
      tests/datawizard/bcsr.c
  83. 3 3
      tests/datawizard/interfaces/block/block_opencl.c
  84. 15 23
      tests/datawizard/interfaces/block/block_opencl_kernel.cl
  85. 1 1
      tests/datawizard/interfaces/tensor/tensor_interface.c
  86. 3 3
      tests/datawizard/interfaces/tensor/tensor_opencl.c
  87. 18 26
      tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl
  88. 8 0
      tests/energy/energy_efficiency.c
  89. 4 4
      tests/helper/starpu_data_dup_ro.c
  90. 343 0
      tests/microbenchs/bandwidth.c
  91. 75 0
      tests/microbenchs/bandwidth_scheds.sh
  92. 1 0
      tests/overlap/overlap.c
  93. 1 1
      tests/parallel_tasks/explicit_combined_worker.c
  94. 1 0
      tools/Makefile.am
  95. 23 0
      tools/dev/valgrind/glpk.suppr
  96. 1 2
      tools/dev/valgrind/libc.suppr
  97. 1 1
      tools/dev/valgrind/padico.suppr
  98. 95 65
      tools/gdbinit
  99. 2 2
      tools/starpu_fxt_tool.c
  100. 0 0
      tools/starpu_perfmodel_recdump.c

+ 2 - 0
ChangeLog

@@ -42,6 +42,7 @@ New features:
   * Add a task prefetch level, to improve retaining data in accelerators so we
     can make prefetch more aggressive.
   * Add starpu_data_dup_ro().
+  * Add starpu_data_release_to() and starpu_data_release_to_on_node().
 
 Small changes:
   * Add a synthetic energy efficiency testcase.
@@ -51,6 +52,7 @@ StarPU 1.3.5 (git revision xxx)
 
 Small changes:
   * Move MPI cache functions into the public API
+  * Add STARPU_MPI_NOBIND environment variable.
 
 StarPU 1.3.4 (git revision c37a5d024cd997596da41f765557c58099baf896)
 ====================================================================

+ 22 - 12
configure.ac

@@ -249,6 +249,8 @@ AC_ARG_WITH(simgrid-lib-dir,
 	], [simgrid_lib_dir=no])
 
 if test x$enable_simgrid = xyes ; then
+	PKG_CHECK_MODULES([SIMGRID], [simgrid])
+
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
@@ -836,7 +838,10 @@ if test x"$enable_native_winthreads" = xyes ; then
 		AC_DEFINE(STARPU_NATIVE_WINTHREADS,[1],[Using native windows threads]),
 		AC_MSG_ERROR([pthread_create unavailable]))
 else
-    AC_CHECK_LIB([pthread], [pthread_create])
+    AC_CHECK_LIB([pthread], [pthread_create], [
+        LIBS="$LIBS -lpthread"
+        STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lpthread"
+    ])
 fi
 
 AC_SEARCH_LIBS([sqrt],[m],,AC_MSG_ERROR([math library unavailable]))
@@ -957,8 +962,8 @@ AC_CHECK_FUNCS([mkdtemp])
 
 AC_CHECK_FUNCS([pread pwrite])
 
-AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--disable-hdf5], [disable HDF5 support])],
-                    enable_hdf5=$enableval, enable_hdf5=maybe)
+AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--enable-hdf5], [disable HDF5 support])],
+                    enable_hdf5=$enableval, enable_hdf5=no)
 
 if test "x$enable_hdf5" != xno ; then
 	AC_ARG_WITH(hdf5-include-dir,
@@ -1017,8 +1022,11 @@ fi
 
 if test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno"; then
         AC_DEFINE([STARPU_HAVE_HDF5], [1], [Define to 1 if you have the <hdf5.h> header file.])
+	enable_hdf5=yes
+else
+	enable_hdf5=no
 fi
-AM_CONDITIONAL(STARPU_HAVE_HDF5, test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno")
+AM_CONDITIONAL(STARPU_HAVE_HDF5, test "x$enable_hdf5" = "xyes")
 
 
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
@@ -2523,8 +2531,8 @@ AC_SUBST(STARPU_EXPORT_DYNAMIC)
 # Computes the maximum number of different kernels a message-passing sink
 # can lookup for and launch.
 AC_MSG_CHECKING(Maximum number of message-passing kernels)
-AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING([
-	      -enable-maxmpkernels=<number>],
+AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING(
+	      [-enable-maxmpkernels=<number>],
 	      [maximum number of kernels a message-passing sink can lookup
 	      for and execute])],
 	      maxmpkernels=$enableval, maxmpkernels=10)
@@ -3111,6 +3119,9 @@ fi
 AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
 			[Disable multiple linear regression models])],
 			enable_mlr=$enableval, enable_mlr=$default_enable_mlr)
+AC_ARG_ENABLE(mlr-system-blas, [AS_HELP_STRING([--enable-mlr-system-blas],
+			[Make the multiple linear regression models use the system BLAS instead of min-dgels])],
+			enable_mlr_blas=$enableval, enable_mlr_blas=no)
 
 AC_MSG_CHECKING(whether multiple linear regression models are disabled)
 if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
@@ -3121,11 +3132,11 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 	if test x$blas_lib = xnone ; then
 	   use_system_lapack=no
 	fi
-	if test x$use_system_lapack = xyes; then
+	if test x$enable_mlr_blas = xyes -a test x$use_system_lapack = xyes; then
 	   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
 		LDFLAGS="-llapack $LDFLAGS"
 	else
-		if test x$blas_lib = xmkl; then
+		if test x$enable_mlr_blas=xyes -a test x$blas_lib = xmkl; then
 		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use mkl library])
 		else
 			AC_MSG_CHECKING(whether min-dgels is linked)
@@ -3142,9 +3153,6 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 					install_min_dgels=no
 					support_mlr=no
 				else
-					if test ! -d $PWD/min-dgels; then
-						cp -r $srcdir/min-dgels $PWD/
-					fi
 					AC_MSG_RESULT(yes)
 					DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/minlibblas.a $STARPU_BUILD_DIR/min-dgels/build/minlibdgels.a $STARPU_BUILD_DIR/min-dgels/build/minlibf2c.a -Wl,--end-group"
 					AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
@@ -3590,7 +3598,7 @@ if test "x$enable_shared" = xno; then
         # No .so, so application will unexpected have to know which -l to
         # use. Give them in .pc file.
 	AC_DEFINE(STARPU_STATIC_ONLY, [1], [Only static compilation was made])
-	STARPU_EXPORTED_LIBS="$LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
+	STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
 fi
 AC_SUBST(STARPU_EXPORTED_LIBS)
 
@@ -3631,6 +3639,7 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
+  test -e tests/microbenchs/bandwidth_scheds.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/bandwidth_scheds.sh tests/microbenchs/
   mkdir -p tests/energy
   test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
   test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
@@ -3806,6 +3815,7 @@ AC_MSG_NOTICE([
                Scheduler Hypervisor:                          $build_sc_hypervisor
                simgrid enabled:                               $enable_simgrid
                ayudame enabled:                               $ayu_msg
+               HDF5 enabled:                                  $enable_hdf5
 	       Native fortran support:                        $enable_build_fortran
 	       Native MPI fortran support:                    $use_mpi_fort
 	       Support for multiple linear regression models: $support_mlr

+ 14 - 1
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -352,7 +352,20 @@ use <c>export STARPU_CALIBRATE=1</c> (\ref STARPU_CALIBRATE). This may be necess
 has not-so-stable performance. StarPU will force calibration (and thus ignore
 the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
 made on each architecture, to avoid bad scheduling decisions just because the
-first measurements were not so good. Details on the current performance model status
+first measurements were not so good.
+
+Note that StarPU will not record the very first measurement for a given codelet
+and a given size, because it would most often be hit by computation library
+loading or initialization. StarPU will also throw measurements away if it
+notices that after computing an average execution time, it notices that most
+subsequent tasks have an execution time largely outside the computed average
+("Too big deviation for model..." warning messages). By looking at the details
+of the message and their reported measurements, it can highlight that your
+computation library really has non-stable measurements, which is probably an
+indication of an issue in the computation library, or the execution environment
+(e.g. rogue daemons).
+
+Details on the current performance model status
 can be obtained with the tool <c>starpu_perfmodel_display</c>: the
 option <c>-l</c> lists the available performance models, and the
 option <c>-s</c> allows to choose the performance model to be

+ 5 - 1
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -375,7 +375,11 @@ parameter are stored in <c>.starpu/sampling/codelets/tmp/</c>
 directory. These files are reused when \ref STARPU_CALIBRATE
 environment variable is set to <c>1</c>, to recompute coefficients
 based on the current, but also on the previous
-executions. Additionally, when multiple linear regression models are
+executions. By default StarPU uses a lightweight dgels implementation, but the
+\ref enable-mlr-system-blas "--enable-mlr-system-blas" configure option can be
+used to make StarPU use a system-provided dgels BLAS.
+
+Additionally, when multiple linear regression models are
 disabled (using \ref disable-mlr "--disable-mlr" configure option) or when the
 <c>model->combinations</c> are not defined, StarPU will still write
 output files into <c>.starpu/sampling/codelets/tmp/</c> to allow

+ 13 - 2
doc/doxygen/chapters/470_simgrid.doxy

@@ -167,8 +167,12 @@ theory results), see the \ref STARPU_SIMGRID_TRANSFER_COST, \ref STARPU_SIMGRID_
 
 \section SimulationMPIApplications MPI Applications
 
-StarPU-MPI applications can also be run in SimGrid mode. It needs to be compiled
-with \c smpicc, and run using the <c>starpu_smpirun</c> script, for instance:
+StarPU-MPI applications can also be run in SimGrid mode. smpi currently requires
+that StarPU be build statically only, so <c>--disable-shared</c> needs to be
+passed to <c>./configure</c>.
+
+The application needs to be compiled with \c smpicc, and run using the
+<c>starpu_smpirun</c> script, for instance:
 
 \verbatim
 $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mpi/tests/pingpong
@@ -182,6 +186,13 @@ in case of a heterogeneous platform, it is possible to use the
 option <c>-hostfile-platform</c> in <c>starpu_smpirun</c>, that will define
 \ref STARPU_MPI_HOSTNAMES with the hostnames of your hostfile.
 
+So as to use FxT traces, libfxt also needs to be built statically, <b>and</b>
+with dynamic linking flags, i.e. with
+
+\verbatim
+CFLAGS=-fPIC ./configure --enable-static
+\endverbatim
+
 \section SimulationDebuggingApplications Debugging Applications
 
 By default, SimGrid uses its own implementation of threads, which prevents \c gdb

+ 8 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -255,6 +255,14 @@ it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
 workers.
 </dd>
 
+<dt>STARPU_MPI_NOBIND</dt>
+<dd>
+\anchor STARPU_MPI_NOBIND
+\addindex __env__STARPU_MPI_NOBIND
+Setting it to non-zero will prevent StarPU from binding the MPI to
+a separate core. This is for instance useful when running the testsuite on a single system.
+</dd>
+
 <dt>STARPU_WORKERS_CUDAID</dt>
 <dd>
 \anchor STARPU_WORKERS_CUDAID

+ 12 - 4
doc/doxygen/chapters/510_configure_options.doxy

@@ -571,11 +571,11 @@ Specify the blas library to be used by some of the examples. Librairies availabl
 Enable linking with LevelDB if available
 </dd>
 
-<dt>--disable-hdf5</dt>
+<dt>--enable-hdf5</dt>
 <dd>
-\anchor disable-hdf5
-\addindex __configure__--disable-hdf5
-Disable building HDF5 support.
+\anchor enable-hdf5
+\addindex __configure__--enable-hdf5
+Enable building HDF5 support.
 </dd>
 
 <dt>--with-hdf5-include-dir=<c>path</c></dt>
@@ -768,6 +768,14 @@ this parameter is 10. Experimental.
 Allow to disable multiple linear regression models (see \ref PerformanceModelExample)
 </dd>
 
+<dt>--enable-mlr-system-blas</dt>
+<dd>
+\anchor enable-mlr-system-blas
+\addindex __configure__--enable-mlr-system-blas
+Allow to make multiple linear regression models use the system-provided BLAS for dgels
+(see \ref PerformanceModelExample)
+</dd>
+
 </dl>
 
 */

+ 1 - 0
examples/Makefile.am

@@ -206,6 +206,7 @@ STARPU_EXAMPLES +=				\
 	api/csr_data_interface			\
 	api/matrix_data_interface		\
 	api/multiformat_data_interface		\
+	api/tensor_data_interface		\
 	api/variable_data_interface		\
 	api/vector_data_interface		\
 	api/void_data_interface

+ 11 - 0
examples/api/bcsr_data_interface.c

@@ -17,6 +17,17 @@
 // This program checks that the implementation of the BCSR data
 // interface only uses StarPU's public API
 
+#define starpu_interface_bcsr_ops my_starpu_interface_bcsr_ops
+#define starpu_bcsr_data_register my_starpu_bcsr_data_register
+#define starpu_bcsr_get_nnz my_starpu_bcsr_get_nnz
+#define starpu_bcsr_get_nrow my_starpu_bcsr_get_nrow
+#define starpu_bcsr_get_firstentry my_starpu_bcsr_get_firstentry
+#define starpu_bcsr_get_r my_starpu_bcsr_get_r
+#define starpu_bcsr_get_c my_starpu_bcsr_get_c
+#define starpu_bcsr_get_elemsize my_starpu_bcsr_get_elemsize
+#define starpu_bcsr_get_local_nzval my_starpu_bcsr_get_local_nzval
+#define starpu_bcsr_get_local_colind my_starpu_bcsr_get_local_colind
+#define starpu_bcsr_get_local_rowptr my_starpu_bcsr_get_local_rowptr
 #include "../../src/datawizard/interfaces/bcsr_interface.c"
 
 int main()

+ 10 - 0
examples/api/block_data_interface.c

@@ -17,6 +17,16 @@
 // This program checks that the implementation of the block data
 // interface only uses StarPU's public API
 
+#define starpu_interface_block_ops my_starpu_interface_block_ops
+#define starpu_block_data_register my_starpu_block_data_register
+#define starpu_block_ptr_register my_starpu_block_ptr_register
+#define starpu_block_get_nx my_starpu_block_get_nx
+#define starpu_block_get_ny my_starpu_block_get_ny
+#define starpu_block_get_nz my_starpu_block_get_nz
+#define starpu_block_get_local_ldy my_starpu_block_get_local_ldy
+#define starpu_block_get_local_ldz my_starpu_block_get_local_ldz
+#define starpu_block_get_local_ptr my_starpu_block_get_local_ptr
+#define starpu_block_get_elemsize my_starpu_block_get_elemsize
 #include "../../src/datawizard/interfaces/block_interface.c"
 
 int main()

+ 2 - 0
examples/api/coo_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the COO data
 // interface only uses StarPU's public API
 
+#define starpu_interface_coo_ops my_starpu_interface_coo_ops
+#define starpu_coo_data_register my_starpu_coo_data_register
 #include "../../src/datawizard/interfaces/coo_interface.c"
 
 int main()

+ 9 - 0
examples/api/csr_data_interface.c

@@ -17,6 +17,15 @@
 // This program checks that the implementation of the CSR data
 // interface only uses StarPU's public API
 
+#define starpu_interface_csr_ops my_starpu_interface_csr_ops
+#define starpu_csr_data_register my_starpu_csr_data_register
+#define starpu_csr_get_nnz my_starpu_csr_get_nnz
+#define starpu_csr_get_nrow my_starpu_csr_get_nrow
+#define starpu_csr_get_firstentry my_starpu_csr_get_firstentry
+#define starpu_csr_get_elemsize my_starpu_csr_get_elemsize
+#define starpu_csr_get_local_nzval my_starpu_csr_get_local_nzval
+#define starpu_csr_get_local_colind my_starpu_csr_get_local_colind
+#define starpu_csr_get_local_rowptr my_starpu_csr_get_local_rowptr
 #include "../../src/datawizard/interfaces/csr_interface.c"
 
 int main()

+ 10 - 0
examples/api/matrix_data_interface.c

@@ -17,6 +17,16 @@
 // This program checks that the implementation of the matrix data
 // interface only uses StarPU's public API
 
+#define starpu_interface_matrix_ops my_starpu_interface_matrix_ops
+#define starpu_matrix_data_register my_starpu_matrix_data_register
+#define starpu_matrix_data_register_allocsize my_starpu_matrix_data_register_allocsize
+#define starpu_matrix_ptr_register my_starpu_matrix_data_ptr_register
+#define starpu_matrix_get_nx my_starpu_matrix_get_nx
+#define starpu_matrix_get_ny my_starpu_matrix_get_ny
+#define starpu_matrix_get_local_ld my_starpu_matrix_get_local_ld
+#define starpu_matrix_get_local_ptr my_starpu_matrix_get_local_ptr
+#define starpu_matrix_get_elemsize my_starpu_matrix_get_elemsize
+#define starpu_matrix_get_allocsize my_starpu_matrix_get_allocsize
 #include "../../src/datawizard/interfaces/matrix_interface.c"
 
 int main()

+ 2 - 0
examples/api/multiformat_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the multiformat data
 // interface only uses StarPU's public API
 
+#define starpu_interface_multiformat_ops my_starpu_interface_multiformat_ops
+#define starpu_multiformat_data_register my_starpu_multiformat_data_register
 #include "../../src/datawizard/interfaces/multiformat_interface.c"
 
 int main()

+ 37 - 0
examples/api/tensor_data_interface.c

@@ -0,0 +1,37 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+// This program checks that the implementation of the tensor data
+// interface only uses StarPU's public API
+
+#define starpu_interface_tensor_ops my_starpu_interface_tensor_ops
+#define starpu_tensor_data_register my_starpu_tensor_data_register
+#define starpu_tensor_ptr_register my_starpu_tensor_data_ptr_register
+#define starpu_tensor_get_nx my_starpu_tensor_get_nx
+#define starpu_tensor_get_ny my_starpu_tensor_get_ny
+#define starpu_tensor_get_nz my_starpu_tensor_get_nz
+#define starpu_tensor_get_nt my_starpu_tensor_get_nt
+#define starpu_tensor_get_local_ldy my_starpu_tensor_get_local_ldy
+#define starpu_tensor_get_local_ldz my_starpu_tensor_get_local_ldz
+#define starpu_tensor_get_local_ldt my_starpu_tensor_get_local_ldt
+#define starpu_tensor_get_local_ptr my_starpu_tensor_get_local_ptr
+#define starpu_tensor_get_elemsize my_starpu_tensor_get_elemsize
+#include "../../src/datawizard/interfaces/tensor_interface.c"
+
+int main()
+{
+        return 0;
+}

+ 5 - 0
examples/api/variable_data_interface.c

@@ -17,6 +17,11 @@
 // This program checks that the implementation of the variable data
 // interface only uses StarPU's public API
 
+#define starpu_interface_variable_ops my_starpu_interface_variable_ops
+#define starpu_variable_data_register my_starpu_variable_data_register
+#define starpu_variable_ptr_register my_starpu_variable_ptr_register
+#define starpu_variable_get_local_ptr my_starpu_variable_get_local_ptr
+#define starpu_variable_get_elemsize my_starpu_variable_get_elemsize
 #include "../../src/datawizard/interfaces/variable_interface.c"
 
 int main()

+ 8 - 0
examples/api/vector_data_interface.c

@@ -17,6 +17,14 @@
 // This program checks that the implementation of the vector data
 // interface only uses StarPU's public API
 
+#define starpu_interface_vector_ops my_starpu_interface_vector_ops
+#define starpu_vector_data_register my_starpu_vector_data_register
+#define starpu_vector_data_register_allocsize my_starpu_vector_data_register_allocsize
+#define starpu_vector_ptr_register my_starpu_vector_data_ptr_register
+#define starpu_vector_get_nx my_starpu_vector_get_nx
+#define starpu_vector_get_local_ptr my_starpu_vector_get_local_ptr
+#define starpu_vector_get_elemsize my_starpu_vector_get_elemsize
+#define starpu_vector_get_allocsize my_starpu_vector_get_allocsize
 #include "../../src/datawizard/interfaces/vector_interface.c"
 
 int main()

+ 2 - 0
examples/api/void_data_interface.c

@@ -17,6 +17,8 @@
 // This program checks that the implementation of the void data
 // interface only uses StarPU's public API
 
+#define starpu_interface_void_ops my_starpu_interface_void_ops
+#define starpu_void_data_register my_starpu_void_data_register
 #include "../../src/datawizard/interfaces/void_interface.c"
 
 int main()

+ 17 - 20
examples/cholesky/cholesky_implicit.c

@@ -92,29 +92,26 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		}
 		starpu_data_wont_use(sdatakk);
 
-		for (m = k+1; m<nblocks; m++)
+		for (n = k+1; n<nblocks; n++)
 		{
-                        starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
-			for (n = k+1; n<nblocks; n++)
+                        starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
+			for (m = n; m<nblocks; m++)
 			{
-				if (n <= m)
-                                {
-					starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
-					starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
-
-					ret = starpu_task_insert(&cl22,
-								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-								 STARPU_R, sdatamk,
-								 STARPU_R, sdatank,
-								 cl22.modes[2], sdatamn,
-								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
-								 STARPU_TAG_ONLY, TAG22(k,m,n),
-								 0);
-					if (ret == -ENODEV) return 77;
-					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-                                }
+				starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
+				starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
+
+				ret = starpu_task_insert(&cl22,
+							 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							 STARPU_R, sdatamk,
+							 STARPU_R, sdatank,
+							 cl22.modes[2], sdatamn,
+							 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
+							 STARPU_TAG_ONLY, TAG22(k,m,n),
+							 0);
+				if (ret == -ENODEV) return 77;
+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 			}
-			starpu_data_wont_use(sdatamk);
+			starpu_data_wont_use(sdatank);
 		}
 		starpu_iteration_pop();
 	}

+ 5 - 0
examples/filters/fblock.c

@@ -169,6 +169,11 @@ int main(void)
         print_data(handle);
         starpu_data_unregister(handle);
 
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+
         /* Print result block */
         FPRINTF(stderr, "OUT Block\n");
         print_block(block, NX, NY, NZ, NX, NX*NY);

+ 2 - 2
examples/filters/fblock_opencl.c

@@ -60,8 +60,8 @@ void opencl_func(void *buffers[], void *cl_arg)
 	CHECK_CL_SET_KERNEL_ARG(kernel, 7, sizeof(*factor), factor);
 
 	{
-		size_t global=nx*ny*nz;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
+		size_t global[3]={nx,ny,nz};
+		err = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, NULL, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 	starpu_opencl_release_kernel(kernel);

+ 13 - 10
examples/filters/fblock_opencl_kernel.cl

@@ -18,14 +18,17 @@
 
 __kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
 {
-        int i, j, k;
-        block = (__global char *)block + offset;
-        for(k=0; k<nz ; k++)
-	{
-                for(j=0; j<ny ; j++)
-		{
-                        for(i=0; i<nx ; i++)
-                                block[(k*ldz)+(j*ldy)+i] = factor;
-                }
-        }
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2);
+	if (idx >= nx)
+		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+
+	block = (__global int*) ((__global char *)block + offset);
+	int i = idz*ldz + idy*ldy + idx;
+	block[i] = factor;
 }

+ 19 - 1
include/starpu_data.h

@@ -350,12 +350,30 @@ void starpu_data_release(starpu_data_handle_t handle);
 
 /**
    Similar to starpu_data_release(), except that the data
-   will be available on the given memory \p node instead of main memory.
+   was made available on the given memory \p node instead of main memory.
    The \p node parameter must be exactly the same as the corresponding \c
    starpu_data_acquire_on_node* call.
 */
 void starpu_data_release_on_node(starpu_data_handle_t handle, int node);
 
+/**
+   Partly release the piece of data acquired by the application either by
+   starpu_data_acquire() or by starpu_data_acquire_cb(), switching the
+   acquisition down to \p down_to_mode. For now, only releasing from STARPU_RW
+   or STARPU_W acquisition down to STARPU_R is supported, or down to the same
+   acquisition.  STARPU_NONE can also be passed as \p down_to_mode, in which
+   case this is equivalent to calling starpu_data_release().
+*/
+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
+
+/**
+   Similar to starpu_data_release_to(), except that the data
+   was made available on the given memory \p node instead of main memory.
+   The \p node parameter must be exactly the same as the corresponding \c
+   starpu_data_acquire_on_node* call.
+*/
+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode, int node);
+
 /** @} */
 
 /**

+ 10 - 16
include/starpu_fxt.h

@@ -69,42 +69,36 @@ struct starpu_fxt_options
 	char *number_events_path;
 	char *anim_path;
 	char *states_path;
+	char worker_names[STARPU_NMAXWORKERS][256];
+	int nworkers;
+	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
 
 	/**
 	   In case we are going to gather multiple traces (e.g in the case of
 	   MPI processes), we may need to prefix the name of the containers.
 	*/
 	char *file_prefix;
+
 	/**
 	   In case we are going to gather multiple traces (e.g in the case of
-	   MPI processes), we may need to prefix the name of the containers.
+	   MPI processes), this variable stores the time offset with the rank 0.
 	*/
 	uint64_t file_offset;
+
 	/**
 	   In case we are going to gather multiple traces (e.g in the case of
-	   MPI processes), we may need to prefix the name of the containers.
+	   MPI processes), this variable stores the MPI rank of the trace file.
 	*/
 	int file_rank;
 
 	/**
-	   Output parameters
-	*/
-	char worker_names[STARPU_NMAXWORKERS][256];
-	/**
-	   Output parameters
-	*/
-	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
-	/**
-	   Output parameters
-	*/
-	int nworkers;
-
-	/**
 	   In case we want to dump the list of codelets to an external tool
 	*/
 	struct starpu_fxt_codelet_event **dumped_codelets;
+
 	/**
-	   In case we want to dump the list of codelets to an external tool
+	   In case we want to dump the list of codelets to an external tool, number
+	   of dumped codelets.
 	*/
 	long dumped_codelets_count;
 };

+ 7 - 2
min-dgels/Makefile.in

@@ -7,13 +7,17 @@ ADDITIONAL=additional
 
 all:
 	mkdir -p build
-	[ -d "$(CLAPACK)" ] || cp -a $(srcdir)/$(CLAPACK) .
+	[ -d "$(CLAPACK)" ] || ( cp -a $(srcdir)/$(CLAPACK) . ; chmod -R +rwX $(CLAPACK) )
 	cd $(CLAPACK) && $(MAKE) blaslib CC="$(CC)" LD="$(LD)"
 	cd $(CLAPACK) && $(MAKE) f2clib CC="$(CC)" LD="$(LD)"
-	[ -d "$(ADDITIONAL)" ] || cp -a $(srcdir)/$(ADDITIONAL) .
+	[ -d "$(ADDITIONAL)" ] || ( cp -a $(srcdir)/$(ADDITIONAL) . ; chmod -R +rwX $(ADDITIONAL) )
 	cd $(ADDITIONAL) && $(CC) -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
 
 install:
+installcheck:
+uninstall:
+distuninstallcheck:
+dvi:
 
 clean:
 	-cd $(CLAPACK) && $(MAKE) clean && rm -rf *~
@@ -21,6 +25,7 @@ clean:
 	rm -rf build *~
 
 distclean: clean
+	[ -f Makefile.in ] || rm -fr $(CLAPACK) $(ADDITIONAL)
 
 # This part is needed by StarPU
 

+ 0 - 152
min-dgels/additional/blaswrap.h

@@ -5,156 +5,4 @@
 #ifndef __BLASWRAP_H
 #define __BLASWRAP_H
 
-#ifndef NO_BLAS_WRAP
- 
-/* BLAS1 routines */
-#define _starpu_srotg_ f2c_srotg
-#define _starpu_crotg_ f2c_crotg
-#define _starpu_drotg_ f2c_drotg
-#define _starpu_zrotg_ f2c_zrotg
-#define _starpu_srotmg_ f2c_srotmg
-#define _starpu_drotmg_ f2c_drotmg
-#define _starpu_srot_ f2c_srot
-#define _starpu_drot_ f2c_drot
-#define _starpu_srotm_ f2c_srotm
-#define _starpu_drotm_ f2c_drotm
-#define _starpu_sswap_ f2c_sswap
-#define _starpu_dswap_ f2c_dswap
-#define _starpu_cswap_ f2c_cswap
-#define _starpu_zswap_ f2c_zswap
-#define _starpu_sscal_ f2c_sscal
-#define _starpu_dscal_ f2c_dscal
-#define _starpu_cscal_ f2c_cscal
-#define _starpu_zscal_ f2c_zscal
-#define _starpu_csscal_ f2c_csscal
-#define _starpu_zdscal_ f2c_zdscal
-#define _starpu_scopy_ f2c_scopy
-#define _starpu_dcopy_ f2c_dcopy
-#define _starpu_ccopy_ f2c_ccopy
-#define _starpu_zcopy_ f2c_zcopy
-#define _starpu_saxpy_ f2c_saxpy
-#define _starpu_daxpy_ f2c_daxpy
-#define _starpu_caxpy_ f2c_caxpy
-#define _starpu_zaxpy_ f2c_zaxpy
-#define _starpu_sdot_ f2c_sdot
-#define _starpu_ddot_ f2c_ddot
-#define _starpu_cdotu_ f2c_cdotu
-#define _starpu_zdotu_ f2c_zdotu
-#define _starpu_cdotc_ f2c_cdotc
-#define _starpu_zdotc_ f2c_zdotc
-#define _starpu_snrm2_ f2c_snrm2
-#define _starpu_dnrm2_ f2c_dnrm2
-#define _starpu_scnrm2_ f2c_scnrm2
-#define _starpu_dznrm2_ f2c_dznrm2
-#define _starpu_sasum_ f2c_sasum
-#define _starpu_dasum_ f2c_dasum
-#define _starpu_scasum_ f2c_scasum
-#define _starpu_dzasum_ f2c_dzasum
-#define _starpu_isamax_ f2c_isamax
-#define _starpu_idamax_ f2c_idamax
-#define _starpu_icamax_ f2c_icamax
-#define _starpu_izamax_ f2c_izamax
- 
-/* BLAS2 routines */
-#define _starpu_sgemv_ f2c_sgemv
-#define _starpu_dgemv_ f2c_dgemv
-#define _starpu_cgemv_ f2c_cgemv
-#define _starpu_zgemv_ f2c_zgemv
-#define _starpu_sgbmv_ f2c_sgbmv
-#define _starpu_dgbmv_ f2c_dgbmv
-#define _starpu_cgbmv_ f2c_cgbmv
-#define _starpu_zgbmv_ f2c_zgbmv
-#define _starpu_chemv_ f2c_chemv
-#define _starpu_zhemv_ f2c_zhemv
-#define _starpu_chbmv_ f2c_chbmv
-#define _starpu_zhbmv_ f2c_zhbmv
-#define _starpu_chpmv_ f2c_chpmv
-#define _starpu_zhpmv_ f2c_zhpmv
-#define _starpu_ssymv_ f2c_ssymv
-#define _starpu_dsymv_ f2c_dsymv
-#define _starpu_ssbmv_ f2c_ssbmv
-#define _starpu_dsbmv_ f2c_dsbmv
-#define _starpu_sspmv_ f2c_sspmv
-#define _starpu_dspmv_ f2c_dspmv
-#define _starpu_strmv_ f2c_strmv
-#define _starpu_dtrmv_ f2c_dtrmv
-#define _starpu_ctrmv_ f2c_ctrmv
-#define _starpu_ztrmv_ f2c_ztrmv
-#define _starpu_stbmv_ f2c_stbmv
-#define _starpu_dtbmv_ f2c_dtbmv
-#define _starpu_ctbmv_ f2c_ctbmv
-#define _starpu_ztbmv_ f2c_ztbmv
-#define _starpu_stpmv_ f2c_stpmv
-#define _starpu_dtpmv_ f2c_dtpmv
-#define _starpu_ctpmv_ f2c_ctpmv
-#define _starpu_ztpmv_ f2c_ztpmv
-#define _starpu_strsv_ f2c_strsv
-#define _starpu_dtrsv_ f2c_dtrsv
-#define _starpu_ctrsv_ f2c_ctrsv
-#define _starpu_ztrsv_ f2c_ztrsv
-#define _starpu_stbsv_ f2c_stbsv
-#define _starpu_dtbsv_ f2c_dtbsv
-#define _starpu_ctbsv_ f2c_ctbsv
-#define _starpu_ztbsv_ f2c_ztbsv
-#define _starpu_stpsv_ f2c_stpsv
-#define _starpu_dtpsv_ f2c_dtpsv
-#define _starpu_ctpsv_ f2c_ctpsv
-#define _starpu_ztpsv_ f2c_ztpsv
-#define _starpu_sger_ f2c_sger
-#define _starpu_dger_ f2c_dger
-#define _starpu_cgeru_ f2c_cgeru
-#define _starpu_zgeru_ f2c_zgeru
-#define _starpu_cgerc_ f2c_cgerc
-#define _starpu_zgerc_ f2c_zgerc
-#define _starpu_cher_ f2c_cher
-#define _starpu_zher_ f2c_zher
-#define _starpu_chpr_ f2c_chpr
-#define _starpu_zhpr_ f2c_zhpr
-#define _starpu_cher2_ f2c_cher2
-#define _starpu_zher2_ f2c_zher2
-#define _starpu_chpr2_ f2c_chpr2
-#define _starpu_zhpr2_ f2c_zhpr2
-#define _starpu_ssyr_ f2c_ssyr
-#define _starpu_dsyr_ f2c_dsyr
-#define _starpu_sspr_ f2c_sspr
-#define _starpu_dspr_ f2c_dspr
-#define _starpu_ssyr2_ f2c_ssyr2
-#define _starpu_dsyr2_ f2c_dsyr2
-#define _starpu_sspr2_ f2c_sspr2
-#define _starpu_dspr2_ f2c_dspr2
- 
-/* BLAS3 routines */
-#define _starpu_sgemm_ f2c_sgemm
-#define _starpu_dgemm_ f2c_dgemm
-#define _starpu_cgemm_ f2c_cgemm
-#define _starpu_zgemm_ f2c_zgemm
-#define _starpu_ssymm_ f2c_ssymm
-#define _starpu_dsymm_ f2c_dsymm
-#define _starpu_csymm_ f2c_csymm
-#define _starpu_zsymm_ f2c_zsymm
-#define _starpu_chemm_ f2c_chemm
-#define _starpu_zhemm_ f2c_zhemm
-#define _starpu_ssyrk_ f2c_ssyrk
-#define _starpu_dsyrk_ f2c_dsyrk
-#define _starpu_csyrk_ f2c_csyrk
-#define _starpu_zsyrk_ f2c_zsyrk
-#define _starpu_cherk_ f2c_cherk
-#define _starpu_zherk_ f2c_zherk
-#define _starpu_ssyr2k_ f2c_ssyr2k
-#define _starpu_dsyr2k_ f2c_dsyr2k
-#define _starpu_csyr2k_ f2c_csyr2k
-#define _starpu_zsyr2k_ f2c_zsyr2k
-#define _starpu_cher2k_ f2c_cher2k
-#define _starpu_zher2k_ f2c_zher2k
-#define _starpu_strmm_ f2c_strmm
-#define _starpu_dtrmm_ f2c_dtrmm
-#define _starpu_ctrmm_ f2c_ctrmm
-#define _starpu_ztrmm_ f2c_ztrmm
-#define _starpu_strsm_ f2c_strsm
-#define _starpu_dtrsm_ f2c_dtrsm
-#define _starpu_ctrsm_ f2c_ctrsm
-#define _starpu_ztrsm_ f2c_ztrsm
-
-#endif /* NO_BLAS_WRAP */
-
 #endif /* __BLASWRAP_H */

+ 8 - 0
min-dgels/base/F2CLIBS/libf2c/Makefile

@@ -175,6 +175,14 @@ xwsne.o:	fio.h
 xwsne.o:	lio.h
 xwsne.o:	fmt.h
 
+main.o:		signal1.h
+signal_.o:	signal1.h
+s_paus.o:	signal1.h
+
+err.o:		sysdep1.h
+fio.h:		sysdep1.h
+util.c:		sysdep1.h
+
 arith.h: arithchk.c
 	$(CC) $(CFLAGS) -DNO_FPINIT arithchk.c -lm ||\
 	 $(CC) -DNO_LONG_LONG $(CFLAGS) -DNO_FPINIT arithchk.c -lm

+ 0 - 152
min-dgels/base/INCLUDE/blaswrap.h

@@ -5,156 +5,4 @@
 #ifndef __BLASWRAP_H
 #define __BLASWRAP_H
 
-#ifndef NO_BLAS_WRAP
- 
-/* BLAS1 routines */
-#define _starpu_srotg_ f2c_srotg
-#define _starpu_crotg_ f2c_crotg
-#define _starpu_drotg_ f2c_drotg
-#define _starpu_zrotg_ f2c_zrotg
-#define _starpu_srotmg_ f2c_srotmg
-#define _starpu_drotmg_ f2c_drotmg
-#define _starpu_srot_ f2c_srot
-#define _starpu_drot_ f2c_drot
-#define _starpu_srotm_ f2c_srotm
-#define _starpu_drotm_ f2c_drotm
-#define _starpu_sswap_ f2c_sswap
-#define _starpu_dswap_ f2c_dswap
-#define _starpu_cswap_ f2c_cswap
-#define _starpu_zswap_ f2c_zswap
-#define _starpu_sscal_ f2c_sscal
-#define _starpu_dscal_ f2c_dscal
-#define _starpu_cscal_ f2c_cscal
-#define _starpu_zscal_ f2c_zscal
-#define _starpu_csscal_ f2c_csscal
-#define _starpu_zdscal_ f2c_zdscal
-#define _starpu_scopy_ f2c_scopy
-#define _starpu_dcopy_ f2c_dcopy
-#define _starpu_ccopy_ f2c_ccopy
-#define _starpu_zcopy_ f2c_zcopy
-#define _starpu_saxpy_ f2c_saxpy
-#define _starpu_daxpy_ f2c_daxpy
-#define _starpu_caxpy_ f2c_caxpy
-#define _starpu_zaxpy_ f2c_zaxpy
-#define _starpu_sdot_ f2c_sdot
-#define _starpu_ddot_ f2c_ddot
-#define _starpu_cdotu_ f2c_cdotu
-#define _starpu_zdotu_ f2c_zdotu
-#define _starpu_cdotc_ f2c_cdotc
-#define _starpu_zdotc_ f2c_zdotc
-#define _starpu_snrm2_ f2c_snrm2
-#define _starpu_dnrm2_ f2c_dnrm2
-#define _starpu_scnrm2_ f2c_scnrm2
-#define _starpu_dznrm2_ f2c_dznrm2
-#define _starpu_sasum_ f2c_sasum
-#define _starpu_dasum_ f2c_dasum
-#define _starpu_scasum_ f2c_scasum
-#define _starpu_dzasum_ f2c_dzasum
-#define _starpu_isamax_ f2c_isamax
-#define _starpu_idamax_ f2c_idamax
-#define _starpu_icamax_ f2c_icamax
-#define _starpu_izamax_ f2c_izamax
- 
-/* BLAS2 routines */
-#define _starpu_sgemv_ f2c_sgemv
-#define _starpu_dgemv_ f2c_dgemv
-#define _starpu_cgemv_ f2c_cgemv
-#define _starpu_zgemv_ f2c_zgemv
-#define _starpu_sgbmv_ f2c_sgbmv
-#define _starpu_dgbmv_ f2c_dgbmv
-#define _starpu_cgbmv_ f2c_cgbmv
-#define _starpu_zgbmv_ f2c_zgbmv
-#define _starpu_chemv_ f2c_chemv
-#define _starpu_zhemv_ f2c_zhemv
-#define _starpu_chbmv_ f2c_chbmv
-#define _starpu_zhbmv_ f2c_zhbmv
-#define _starpu_chpmv_ f2c_chpmv
-#define _starpu_zhpmv_ f2c_zhpmv
-#define _starpu_ssymv_ f2c_ssymv
-#define _starpu_dsymv_ f2c_dsymv
-#define _starpu_ssbmv_ f2c_ssbmv
-#define _starpu_dsbmv_ f2c_dsbmv
-#define _starpu_sspmv_ f2c_sspmv
-#define _starpu_dspmv_ f2c_dspmv
-#define _starpu_strmv_ f2c_strmv
-#define _starpu_dtrmv_ f2c_dtrmv
-#define _starpu_ctrmv_ f2c_ctrmv
-#define _starpu_ztrmv_ f2c_ztrmv
-#define _starpu_stbmv_ f2c_stbmv
-#define _starpu_dtbmv_ f2c_dtbmv
-#define _starpu_ctbmv_ f2c_ctbmv
-#define _starpu_ztbmv_ f2c_ztbmv
-#define _starpu_stpmv_ f2c_stpmv
-#define _starpu_dtpmv_ f2c_dtpmv
-#define _starpu_ctpmv_ f2c_ctpmv
-#define _starpu_ztpmv_ f2c_ztpmv
-#define _starpu_strsv_ f2c_strsv
-#define _starpu_dtrsv_ f2c_dtrsv
-#define _starpu_ctrsv_ f2c_ctrsv
-#define _starpu_ztrsv_ f2c_ztrsv
-#define _starpu_stbsv_ f2c_stbsv
-#define _starpu_dtbsv_ f2c_dtbsv
-#define _starpu_ctbsv_ f2c_ctbsv
-#define _starpu_ztbsv_ f2c_ztbsv
-#define _starpu_stpsv_ f2c_stpsv
-#define _starpu_dtpsv_ f2c_dtpsv
-#define _starpu_ctpsv_ f2c_ctpsv
-#define _starpu_ztpsv_ f2c_ztpsv
-#define _starpu_sger_ f2c_sger
-#define _starpu_dger_ f2c_dger
-#define _starpu_cgeru_ f2c_cgeru
-#define _starpu_zgeru_ f2c_zgeru
-#define _starpu_cgerc_ f2c_cgerc
-#define _starpu_zgerc_ f2c_zgerc
-#define _starpu_cher_ f2c_cher
-#define _starpu_zher_ f2c_zher
-#define _starpu_chpr_ f2c_chpr
-#define _starpu_zhpr_ f2c_zhpr
-#define _starpu_cher2_ f2c_cher2
-#define _starpu_zher2_ f2c_zher2
-#define _starpu_chpr2_ f2c_chpr2
-#define _starpu_zhpr2_ f2c_zhpr2
-#define _starpu_ssyr_ f2c_ssyr
-#define _starpu_dsyr_ f2c_dsyr
-#define _starpu_sspr_ f2c_sspr
-#define _starpu_dspr_ f2c_dspr
-#define _starpu_ssyr2_ f2c_ssyr2
-#define _starpu_dsyr2_ f2c_dsyr2
-#define _starpu_sspr2_ f2c_sspr2
-#define _starpu_dspr2_ f2c_dspr2
- 
-/* BLAS3 routines */
-#define _starpu_sgemm_ f2c_sgemm
-#define _starpu_dgemm_ f2c_dgemm
-#define _starpu_cgemm_ f2c_cgemm
-#define _starpu_zgemm_ f2c_zgemm
-#define _starpu_ssymm_ f2c_ssymm
-#define _starpu_dsymm_ f2c_dsymm
-#define _starpu_csymm_ f2c_csymm
-#define _starpu_zsymm_ f2c_zsymm
-#define _starpu_chemm_ f2c_chemm
-#define _starpu_zhemm_ f2c_zhemm
-#define _starpu_ssyrk_ f2c_ssyrk
-#define _starpu_dsyrk_ f2c_dsyrk
-#define _starpu_csyrk_ f2c_csyrk
-#define _starpu_zsyrk_ f2c_zsyrk
-#define _starpu_cherk_ f2c_cherk
-#define _starpu_zherk_ f2c_zherk
-#define _starpu_ssyr2k_ f2c_ssyr2k
-#define _starpu_dsyr2k_ f2c_dsyr2k
-#define _starpu_csyr2k_ f2c_csyr2k
-#define _starpu_zsyr2k_ f2c_zsyr2k
-#define _starpu_cher2k_ f2c_cher2k
-#define _starpu_zher2k_ f2c_zher2k
-#define _starpu_strmm_ f2c_strmm
-#define _starpu_dtrmm_ f2c_dtrmm
-#define _starpu_ctrmm_ f2c_ctrmm
-#define _starpu_ztrmm_ f2c_ztrmm
-#define _starpu_strsm_ f2c_strsm
-#define _starpu_dtrsm_ f2c_dtrsm
-#define _starpu_ctrsm_ f2c_ctrsm
-#define _starpu_ztrsm_ f2c_ztrsm
-
-#endif /* NO_BLAS_WRAP */
-
 #endif /* __BLASWRAP_H */

+ 5 - 3
mpi/examples/Makefile.am

@@ -47,10 +47,10 @@ endif
 endif
 
 if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 else
-TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 
 if STARPU_MPI_CHECK
@@ -155,7 +155,9 @@ examplebin_PROGRAMS += 			\
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=	\
 	mpi_lu/plu_implicit_example_float	\
-	mpi_lu/plu_implicit_example_double
+	mpi_lu/plu_implicit_example_double	\
+	mpi_lu/plu_outofcore_example_float	\
+	mpi_lu/plu_outofcore_example_double
 endif
 
 mpi_lu_plu_example_float_LDADD =	\

+ 10 - 5
mpi/examples/benchs/abstract_sendrecv_bench.c

@@ -22,25 +22,30 @@
 void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 {
 	uint64_t iterations = LOOPS_DEFAULT;
+	uint64_t s = 0;
+	uint64_t j = 0;
+	uint64_t k = 0;
 
 	if (mpi_rank >= 2)
 	{
+		starpu_pause();
 		if (thread_barrier != NULL)
 		{
 			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
 		}
 
-		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+		for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 		{
 			iterations = bench_nb_iterations(iterations, s);
 
 			starpu_mpi_barrier(MPI_COMM_WORLD);
 
-			for (uint64_t j = 0; j < iterations; j++)
+			for (j = 0; j < iterations; j++)
 			{
 				starpu_mpi_barrier(MPI_COMM_WORLD);
 			}
 		}
+		starpu_resume();
 
 		return;
 	}
@@ -64,7 +69,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 	}
 
 	global_tstart = starpu_timing_now();
-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 	{
 		vector_send = malloc(s);
 		vector_recv = malloc(s);
@@ -78,7 +83,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
 		starpu_mpi_barrier(MPI_COMM_WORLD);
 
-		for (uint64_t j = 0; j < iterations; j++)
+		for (j = 0; j < iterations; j++)
 		{
 			if (mpi_rank == 0)
 			{
@@ -111,7 +116,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 			const double d9_lat = lats[9 * (iterations - 1) / 10];
 			double avg_lat = 0.0;
 
-			for(uint64_t k = 0; k < iterations; k++)
+			for(k = 0; k < iterations; k++)
 			{
 				avg_lat += lats[k];
 			}

+ 17 - 12
mpi/examples/benchs/burst_helper.c

@@ -45,7 +45,8 @@ void burst_init_data(int rank)
 		recv_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
 		send_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
 
-		for (int i = 0; i < burst_nb_requests; i++)
+		int i = 0;
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			send_buffers[i] = malloc(NX_ARRAY * sizeof(float));
 			memset(send_buffers[i], 0, NX_ARRAY * sizeof(float));
@@ -62,7 +63,8 @@ void burst_free_data(int rank)
 {
 	if (rank == 0 || rank == 1)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		int i = 0;
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			starpu_data_unregister(send_handles[i]);
 			free(send_buffers[i]);
@@ -84,12 +86,13 @@ void burst_free_data(int rank)
 void burst_bidir(int rank)
 {
 	int other_rank = (rank == 0) ? 1 : 0;
+	int i = 0;
 
 	FPRINTF(stderr, "Simultaneous....start (rank %d)\n", rank);
 
 	if (rank == 0 || rank == 1)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			recv_reqs[i] = NULL;
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
@@ -100,13 +103,13 @@ void burst_bidir(int rank)
 
 	if (rank == 0 || rank == 1)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
 		}
 
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
@@ -120,10 +123,11 @@ void burst_bidir(int rank)
 void burst_unidir(int sender, int receiver, int rank)
 {
 	FPRINTF(stderr, "%d -> %d... start (rank %d)\n", sender, receiver, rank);
+	int i = 0;
 
 	if (rank == receiver)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			recv_reqs[i] = NULL;
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], sender, i, MPI_COMM_WORLD);
@@ -134,7 +138,7 @@ void burst_unidir(int sender, int receiver, int rank)
 
 	if (rank == sender)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], receiver, i, i, MPI_COMM_WORLD);
@@ -143,7 +147,7 @@ void burst_unidir(int sender, int receiver, int rank)
 
 	if (rank == sender || rank == receiver)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			if (rank != sender && recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (rank == sender && send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
@@ -160,12 +164,13 @@ void burst_bidir_half_postponed(int rank)
 {
 	int other_rank = (rank == 0) ? 1 : 0;
 	int received = 0;
+	int i = 0;
 
 	FPRINTF(stderr, "Half/half burst...start (rank %d)\n", rank);
 
 	if (rank == 0 || rank == 1)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			recv_reqs[i] = NULL;
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
@@ -176,7 +181,7 @@ void burst_bidir_half_postponed(int rank)
 
 	if (rank == 0 || rank == 1)
 	{
-		for (int i = 0; i < (burst_nb_requests / 2); i++)
+		for (i = 0; i < (burst_nb_requests / 2); i++)
 		{
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
@@ -184,13 +189,13 @@ void burst_bidir_half_postponed(int rank)
 
 		if (recv_reqs[burst_nb_requests / 4]) starpu_mpi_wait(&recv_reqs[burst_nb_requests / 4], MPI_STATUS_IGNORE);
 
-		for (int i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
+		for (i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
 		{
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
 		}
 
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);

+ 10 - 5
mpi/examples/benchs/gemm_helper.c

@@ -98,8 +98,9 @@ static void cpu_init_matrix_random(void *descr[], void *arg)
 	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned i = 0;
 
-	for (unsigned i = 0; i < nx *ny; i++)
+	for (i = 0; i < nx *ny; i++)
 	{
 		subA[i] = (TYPE) (starpu_drand48());
 		subB[i] = (TYPE) (starpu_drand48());
@@ -113,8 +114,9 @@ static void cpu_init_matrix_zero(void *descr[], void *arg)
 	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned i = 0;
 
-	for (unsigned i = 0; i < nx *ny; i++)
+	for (i = 0; i < nx *ny; i++)
 	{
 		subA[i] = (TYPE) (0);
 	}
@@ -290,18 +292,21 @@ void gemm_add_polling_dependencies()
 {
 	starpu_tag_t nb_tasks = (starpu_tag_t) nslices * (starpu_tag_t) nslices;
 	unsigned nb_workers = starpu_worker_get_count();
+	starpu_tag_t synchro_tag = 0;
+	starpu_tag_t previous_tag = 0;
+	starpu_tag_t next_tag = 0;
 
-	for (starpu_tag_t synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
+	for (synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
 	{
 		// this synchro tag depends on tasks of previous column of tasks:
-		for (starpu_tag_t previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
+		for (previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
 		{
 			starpu_tag_declare_deps(synchro_tag, 1, previous_tag);
 		}
 
 		// tasks of the next column of tasks depend on this synchro tag:
 		// this actually allows workers to poll for new tasks, while no task is available
-		for (starpu_tag_t next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
+		for (next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
 		{
 			starpu_tag_declare_deps(next_tag, 1, synchro_tag);
 		}

+ 2 - 1
mpi/examples/benchs/sendrecv_bench.c

@@ -27,9 +27,10 @@ int main(int argc, char **argv)
 {
 	int ret, rank, worldsize;
 	int pause_workers = 0;
+	int i = 0;
 
 
-	for (int i = 1; i < argc; i++)
+	for (i = 1; i < argc; i++)
 	{
 		if (strcmp(argv[i], "-p") == 0)
 		{

+ 10 - 6
mpi/examples/benchs/sendrecv_parallel_tasks_bench.c

@@ -56,6 +56,8 @@ void cpu_task(void* descr[], void* args)
 	double t1, t2;
 	int asked_worker;
 	int current_worker = starpu_worker_get_id();
+	uint64_t j = 0;
+	uint64_t k = 0;
 
 	starpu_codelet_unpack_args(args, &mpi_rank, &asked_worker, &s, &handle_send, &handle_recv);
 
@@ -64,7 +66,7 @@ void cpu_task(void* descr[], void* args)
 	iterations = bench_nb_iterations(iterations, s);
 	double* lats = malloc(sizeof(double) * iterations);
 
-	for (uint64_t j = 0; j < NB_WARMUP_PINGPONGS; j++)
+	for (j = 0; j < NB_WARMUP_PINGPONGS; j++)
 	{
 		if (mpi_rank == 0)
 		{
@@ -78,7 +80,7 @@ void cpu_task(void* descr[], void* args)
 		}
 	}
 
-	for (uint64_t j = 0; j < iterations; j++)
+	for (j = 0; j < iterations; j++)
 	{
 		if (mpi_rank == 0)
 		{
@@ -107,7 +109,7 @@ void cpu_task(void* descr[], void* args)
 		const double d9_lat = lats[9 * (iterations - 1) / 10];
 		double avg_lat = 0.0;
 
-		for(uint64_t k = 0; k < iterations; k++)
+		for(k = 0; k < iterations; k++)
 		{
 			avg_lat += lats[k];
 		}
@@ -167,6 +169,8 @@ int main(int argc, char **argv)
 	unsigned cpu_count = starpu_cpu_worker_get_count();
 	unsigned* mpi_tags = malloc(cpu_count * sizeof(unsigned));
 	unsigned tag = 0;
+	uint64_t s = 0;
+	unsigned i = 0;
 
 	int* workers = malloc(cpu_count * sizeof(int));
 	float** vectors_send = malloc(cpu_count * sizeof(float*));
@@ -174,11 +178,11 @@ int main(int argc, char **argv)
 	starpu_data_handle_t* handles_send = malloc(cpu_count * sizeof(starpu_data_handle_t));
 	starpu_data_handle_t* handles_recv = malloc(cpu_count * sizeof(starpu_data_handle_t));
 
-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 	{
 		starpu_pause();
 
-		for (unsigned i = 0; i < cpu_count; i++)
+		for (i = 0; i < cpu_count; i++)
 		{
 			workers[i] = i;
 			vectors_send[i] = malloc(s);
@@ -201,7 +205,7 @@ int main(int argc, char **argv)
 		starpu_resume();
 		starpu_task_wait_for_all();
 
-		for (unsigned i = 0; i < cpu_count; i++)
+		for (i = 0; i < cpu_count; i++)
 		{
 			starpu_data_unregister(handles_send[i]);
 			starpu_data_unregister(handles_recv[i]);

+ 1 - 1
mpi/examples/filters/filter.c

@@ -59,7 +59,7 @@ void vector_filter(void *father_interface, void *child_interface, struct starpu_
 
 	STARPU_ASSERT_MSG(nchunks <= nx, "%u parts for %u elements", nchunks, nx);
 	STARPU_ASSERT(nchunks == 2);
-	STARPU_ASSERT_MSG((nx % nchunks) == 0, "nx=%d is not a multiple of nchunks %d\n", nx, nchunks);
+	STARPU_ASSERT_MSG((nx % nchunks) == 0, "nx=%u is not a multiple of nchunks %u\n", nx, nchunks);
 
 	vector_child->id = vector_father->id;
 	vector_child->nx = nx/2;

+ 149 - 41
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -20,6 +20,64 @@
 #include <limits.h>
 #include <math.h>
 
+/* This is from magma
+
+  -- Innovative Computing Laboratory
+  -- Electrical Engineering and Computer Science Department
+  -- University of Tennessee
+  -- (C) Copyright 2009
+
+  Redistribution  and  use  in  source and binary forms, with or without
+  modification,  are  permitted  provided  that the following conditions
+  are met:
+
+  * Redistributions  of  source  code  must  retain  the above copyright
+    notice,  this  list  of  conditions  and  the  following  disclaimer.
+  * Redistributions  in  binary  form must reproduce the above copyright
+    notice,  this list of conditions and the following disclaimer in the
+    documentation  and/or other materials provided with the distribution.
+  * Neither  the  name of the University of Tennessee, Knoxville nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  */
+
+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
+
+#define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
+
+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
+
+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
+
+#define FMULS_TRSM FMULS_TRMM
+#define FADDS_TRSM FMULS_TRMM
+
+#define FLOPS_STRSM(__m, __n) (     FMULS_TRSM((__m), (__n)) +       FADDS_TRSM((__m), (__n)) )
+
+
+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
+
+#define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
+
+/* End of magma code */
+
 /*
  *	Create the codelets
  */
@@ -72,6 +130,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 {
 	unsigned k, m, n;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+	unsigned nn = size/nblocks;
 
 	for (k = 0; k < nblocks; k++)
 	{
@@ -80,6 +139,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 				       STARPU_RW, data_handles[k][k],
+				       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 				       0);
 
 		for (m = k+1; m<nblocks; m++)
@@ -88,28 +148,30 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 					       STARPU_R, data_handles[k][k],
 					       STARPU_RW, data_handles[m][k],
+					       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 					       0);
 
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
 			if (my_distrib(k, k, nodes) == rank)
 				starpu_data_wont_use(data_handles[k][k]);
+		}
 
-			for (n = k+1; n<nblocks; n++)
+		for (n = k+1; n<nblocks; n++)
+		{
+			for (m = n; m<nblocks; m++)
 			{
-				if (n <= m)
-				{
-					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-							       STARPU_R, data_handles[n][k],
-							       STARPU_R, data_handles[m][k],
-							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
-							       0);
-				}
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
+						       0);
 			}
 
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
-			if (my_distrib(m, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[m][k]);
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+			if (my_distrib(n, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[n][k]);
 		}
 		starpu_iteration_pop();
 	}
@@ -120,6 +182,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 {
 	unsigned k, m, n;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+	unsigned nn = size/nblocks;
 
 	/* Column */
 	for (n = 0; n<nblocks; n++)
@@ -137,7 +200,15 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 						       STARPU_R, data_handles[n][k],
 						       STARPU_R, data_handles[m][k],
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 						       0);
+
+				if (m == n)
+				{
+					/* Nobody else will need it */
+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+					starpu_data_wont_use(data_handles[m][k]);
+				}
 			}
 			k = n;
 			if (m > n)
@@ -147,6 +218,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						       STARPU_R, data_handles[k][k],
 						       STARPU_RW, data_handles[m][k],
+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						       0);
 			}
 			else
@@ -155,26 +227,27 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 						       STARPU_RW, data_handles[k][k],
+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 						       0);
 			}
+
 		}
 
+		/* We won't need it any more */
+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+		starpu_data_wont_use(data_handles[n][n]);
+
 		starpu_iteration_pop();
 	}
-
-	/* Submit flushes, StarPU will fit them according to the progress */
-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
-	for (m = 0; m < nblocks; m++)
-		for (n = 0; n < nblocks ; n++)
-			starpu_data_wont_use(data_handles[m][n]);
 }
 
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
 static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
 {
-	unsigned a, c;
+	unsigned a;
 	unsigned k, m, n;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+	unsigned nn = size/nblocks;
 
 	/* double-antidiagonal number:
 	 * - a=0 contains (0,0) plus (1,0)
@@ -205,7 +278,15 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 						       STARPU_R, data_handles[n][k],
 						       STARPU_R, data_handles[m][k],
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 						       0);
+
+				if (m == nblocks-1)
+				{
+					/* Nobody else will need it */
+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+					starpu_data_wont_use(data_handles[n][k]);
+				}
 			}
 
 			/* k = n */
@@ -216,6 +297,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						       STARPU_R, data_handles[k][k],
 						       STARPU_RW, data_handles[m][k],
+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						       0);
 			}
 			else
@@ -224,8 +306,16 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 						       STARPU_RW, data_handles[k][k],
+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 						       0);
 			}
+
+			if (m == nblocks - 1)
+			{
+				/* We do not need the potrf result any more */
+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+				starpu_data_wont_use(data_handles[n][n]);
+			}
 		}
 
 		/* column within second antidiagonal for a */
@@ -246,7 +336,15 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 						       STARPU_R, data_handles[n][k],
 						       STARPU_R, data_handles[m][k],
 						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 						       0);
+
+				if (m == nblocks-1)
+				{
+					/* Nobody else will need it */
+					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+					starpu_data_wont_use(data_handles[n][k]);
+				}
 			}
 			/* non-diagonal block, solve */
 			k = n;
@@ -254,17 +352,19 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
 					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 					       STARPU_R, data_handles[k][k],
 					       STARPU_RW, data_handles[m][k],
+					       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 					       0);
+
+			if (m == nblocks - 1)
+			{
+				/* We do not need the potrf result any more */
+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+				starpu_data_wont_use(data_handles[n][n]);
+			}
 		}
 
 		starpu_iteration_pop();
 	}
-
-	/* Submit flushes, StarPU will fit them according to the progress */
-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
-	for (m = 0; m < nblocks; m++)
-		for (n = 0; n < nblocks ; n++)
-			starpu_data_wont_use(data_handles[m][n]);
 }
 
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
@@ -273,9 +373,10 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 	unsigned a;
 	int k, m, n;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+	unsigned nn = size/nblocks;
 
 	/*
-	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that prio ~ 2*a or 2*a+1
+	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that gemm prio ~= 2*nblocks - a
 	 * double-antidiagonal number:
 	 * - a=0 contains (0,0) plus (1,0)
 	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
@@ -285,41 +386,47 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 	{
 		starpu_iteration_push(a);
 
-		for (k = 0; k < nblocks; k++)
+		for (k = 0; k < (int) nblocks; k++)
 		{
 			n = k;
 			/* Should be m = a-k-n; for potrf and trsm to respect
 			   priorities, but needs to be this for dependencies */
 			m = a-2*k-n;
 
-			if (m < 0 || m >= nblocks)
-				continue;
-
 			if (m == n)
 			{
 				/* diagonal block, factorize */
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
 						       STARPU_RW, data_handles[k][k],
+						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 						       0);
 			}
-			else
+			else if (m >= n && m < (int) nblocks)
 			{
 				/* non-diagonal block, solve */
 				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
 						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						       STARPU_R, data_handles[k][k],
 						       STARPU_RW, data_handles[m][k],
+						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						       0);
 			}
 
+			if (m == (int) nblocks - 1)
+			{
+				/* We do not need the potrf result any more */
+				starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
+				starpu_data_wont_use(data_handles[n][n]);
+			}
+
 			/* column within antidiagonal for a */
-			for (n = k + 1; n < nblocks; n++)
+			for (n = k + 1; n < (int) nblocks; n++)
 			{
 				/* row */
 				m = a-2*k-n;
 
-				if (m >= n && m < nblocks)
+				if (m >= n && m < (int) nblocks)
 				{
 					/* Update */
 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
@@ -327,7 +434,14 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 							       STARPU_R, data_handles[n][k],
 							       STARPU_R, data_handles[m][k],
 							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+							       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 							       0);
+					if (m == (int) nblocks - 1)
+					{
+						/* Nobody else will need it */
+						starpu_data_wont_use(data_handles[n][k]);
+						starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
+					}
 				}
 			}
 
@@ -335,12 +449,6 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
 
 		starpu_iteration_pop();
 	}
-
-	/* Submit flushes, StarPU will fit them according to the progress */
-	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
-	for (m = 0; m < nblocks; m++)
-		for (n = 0; n < nblocks ; n++)
-			starpu_data_wont_use(data_handles[m][n]);
 }
 
 /*
@@ -423,7 +531,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	if (rank == 0)
 	{
 		*timing = end - start;
-		*flops = (1.0f*size*size*size)/3.0f;
+		*flops = FLOPS_SPOTRF(size);
 	}
 }
 

+ 14 - 11
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -56,63 +56,66 @@ void parse_args(int argc, char **argv, int nodes)
                         size = strtol(argv[++i], &argptr, 10);
                 }
 
-                if (strcmp(argv[i], "-dblockx") == 0)
+                else if (strcmp(argv[i], "-dblockx") == 0)
                 {
                         char *argptr;
                         dblockx = strtol(argv[++i], &argptr, 10);
                 }
 
-                if (strcmp(argv[i], "-dblocky") == 0)
+                else if (strcmp(argv[i], "-dblocky") == 0)
                 {
                         char *argptr;
                         dblocky = strtol(argv[++i], &argptr, 10);
                 }
 
-                if (strcmp(argv[i], "-nblocks") == 0)
+                else if (strcmp(argv[i], "-nblocks") == 0)
                 {
                         char *argptr;
                         nblocks = strtol(argv[++i], &argptr, 10);
                 }
 
-                if (strcmp(argv[i], "-nbigblocks") == 0)
+                else if (strcmp(argv[i], "-nbigblocks") == 0)
                 {
                         char *argptr;
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                 }
 
-                if (strcmp(argv[i], "-columns") == 0)
+                else if (strcmp(argv[i], "-columns") == 0)
                 {
                         submission = COLUMNS;
                 }
 
-                if (strcmp(argv[i], "-antidiagonals") == 0)
+                else if (strcmp(argv[i], "-antidiagonals") == 0)
                 {
                         submission = ANTIDIAGONALS;
                 }
 
-                if (strcmp(argv[i], "-prios") == 0)
+                else if (strcmp(argv[i], "-prios") == 0)
                 {
                         submission = PRIOS;
                 }
 
-                if (strcmp(argv[i], "-no-prio") == 0)
+                else if (strcmp(argv[i], "-no-prio") == 0)
                 {
                         noprio = 1;
                 }
 
-                if (strcmp(argv[i], "-check") == 0)
+                else if (strcmp(argv[i], "-check") == 0)
                 {
                         check = 1;
                 }
 
-                if (strcmp(argv[i], "-display") == 0)
+                else if (strcmp(argv[i], "-display") == 0)
                 {
                         display = 1;
                 }
 
-                if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+                else
+                /* if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) */
                 {
                         printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-prios] [-no-prio] [-display] [-check]\n", argv[0]);
+                        fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
+                        exit(0);
                 }
         }
 

+ 23 - 3
mpi/examples/mpi_lu/plu_example.c

@@ -37,8 +37,8 @@
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned check = 0;
-static int p = 1;
-static int q = 1;
+static int p = -1;
+static int q = -1;
 static unsigned display = 0;
 static unsigned no_prio = 0;
 
@@ -434,6 +434,7 @@ int main(int argc, char **argv)
 	int rank;
 	int world_size;
 	int ret;
+	unsigned i, j;
 
 	/*
 	 *	Initialization
@@ -462,7 +463,14 @@ int main(int argc, char **argv)
 	/* We disable sequential consistency in this example */
 	starpu_data_set_default_sequential_consistency_flag(0);
 
-	STARPU_ASSERT(p*q == world_size);
+	if (p == -1 && q==-1)
+	{
+		fprintf(stderr, "Setting default values for p and q\n");
+		p = (q % 2 == 0) ? 2 : 1;
+		q = world_size / p;
+
+	}
+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
 
 	starpu_cublas_init();
 
@@ -594,6 +602,18 @@ int main(int argc, char **argv)
 	/*
 	 * 	Termination
 	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			TYPE *blockptr = dataA[j+i*nblocks];
+			if (blockptr != STARPU_POISON_PTR)
+				starpu_free(blockptr);
+		}
+	}
+	free(dataA_handles);
+	free(dataA);
 
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

+ 13 - 0
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -249,6 +249,7 @@ int main(int argc, char **argv)
 	int rank;
 	int world_size;
 	int ret;
+	unsigned i, j;
 
 	starpu_srand48((long int)time(NULL));
 
@@ -376,6 +377,18 @@ int main(int argc, char **argv)
 	/*
 	 * 	Termination
 	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			TYPE *blockptr = dataA[j+i*nblocks];
+			if (blockptr != STARPU_POISON_PTR)
+				starpu_free(blockptr);
+		}
+	}
+	free(dataA_handles);
+	free(dataA);
 
 	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();

+ 26 - 11
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -39,9 +39,10 @@
 
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
+static size_t blocksize;
 static unsigned check = 0;
-static int p = 1;
-static int q = 1;
+static int p = -1;
+static int q = -1;
 static unsigned display = 0;
 static unsigned no_prio = 0;
 static char *path = "./starpu-ooc-files";
@@ -53,6 +54,9 @@ static unsigned numa = 0;
 static size_t allocated_memory = 0;
 
 static starpu_data_handle_t *dataA_handles;
+static void **disk_objs;
+
+static int disk_node;
 
 int get_block_rank(unsigned i, unsigned j);
 
@@ -142,7 +146,6 @@ static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnbl
 
 static void create_matrix()
 {
-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
 	TYPE *blockptr = malloc(blocksize);
 	int fd;
 	char *filename;
@@ -195,10 +198,9 @@ static void init_matrix(int rank)
 {
 	/* Allocate a grid of data handles, not all of them have to be allocated later on */
 	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+	disk_objs = calloc(nblocks*nblocks, sizeof(*disk_objs));
 
-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
-
-	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
+	disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(16*1024*1024, size*size*sizeof(TYPE)));
 	assert(disk_node >= 0);
 
 	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
@@ -215,21 +217,21 @@ static void init_matrix(int rank)
 
 			if (block_rank == rank)
 			{
-				void *disk_obj;
 				snprintf(filename, sizeof(filename), "%u,%u", i, j);
 				/* Register it to StarPU */
-				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
-				if (!disk_obj)
+				disk_objs[j+nblocks*i] = starpu_disk_open(disk_node, filename, blocksize);
+				if (!disk_objs[j+nblocks*i])
 				{
 					fprintf(stderr,"could not open %s\n", filename);
 					exit(1);
 				}
 				starpu_matrix_data_register(handleptr, disk_node,
-					(uintptr_t) disk_obj, size/nblocks,
+					(uintptr_t) disk_objs[j+nblocks*i], size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
 			}
 			else
 			{
+				disk_objs[j+nblocks*i] = NULL;
 				starpu_matrix_data_register(handleptr, -1,
 					0, size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
@@ -273,6 +275,8 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
+	blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
 	ret = mkdir(path, 0777);
 	if (ret != 0 && errno != EEXIST)
 	{
@@ -286,7 +290,14 @@ int main(int argc, char **argv)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
 
-	STARPU_ASSERT(p*q == world_size);
+	if (p == -1 && q==-1)
+	{
+		fprintf(stderr, "Setting default values for p and q\n");
+		p = (q % 2 == 0) ? 2 : 1;
+		q = world_size / p;
+
+	}
+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
 
 	starpu_cublas_init();
 
@@ -401,8 +412,12 @@ int main(int argc, char **argv)
 		for (i = 0; i < nblocks; i++)
 		{
 			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			if (disk_objs[j+nblocks*i])
+				starpu_disk_close(disk_node, disk_objs[j+nblocks*i], blocksize);
 		}
 	}
+	free(dataA_handles);
+	free(disk_objs);
 
 	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();

+ 18 - 14
mpi/src/mpi/starpu_mpi_mpi.c

@@ -203,7 +203,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			else
 			{
 				STARPU_ASSERT(req->count);
-				_STARPU_MPI_MALLOC(req->ptr, req->count);
+				req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
 			}
 
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
@@ -225,12 +225,12 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
 
-			/* Case: a receive request for a data with the given tag and source has already been
-			 * posted by StarPU. Asynchronously requests a Read permission over the temporary handle ,
-			 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
-			 * will be called to bring the data back to the original data handle associated to the request.*/
 			if (early_data_handle)
 			{
+				/* Case: a receive request for a data with the given tag and source has already been
+				 * posted to MPI by StarPU. Asynchronously requests a Read permission over the temporary handle ,
+				 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
+				 * will be called to bring the data back to the original data handle associated to the request.*/
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
 				while (!(early_data_handle->req_ready))
@@ -254,16 +254,16 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
+				starpu_data_acquire_on_node_cb(early_data_handle->handle,STARPU_MAIN_RAM,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			}
-			/* Case: no matching data has been received. Store the receive request as an early_request. */
 			else
 			{
 				struct _starpu_mpi_req *sync_req = _starpu_mpi_sync_data_find(req->node_tag.data_tag, req->node_tag.node.rank, req->node_tag.node.comm);
 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
 				if (sync_req)
 				{
+					/* Case: we already received the send envelope, we can proceed with the receive */
 					req->sync = 1;
 					_starpu_mpi_datatype_allocate(req->data_handle, req);
 					if (req->registered_datatype == 1)
@@ -275,14 +275,16 @@ void _starpu_mpi_submit_ready_request(void *arg)
 					{
 						req->count = sync_req->count;
 						STARPU_ASSERT(req->count);
-						_STARPU_MPI_MALLOC(req->ptr, req->count);
+						req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
 					}
 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_STARPU_MPI_INC_READY_REQUESTS(+1);
+					/* Throw away the dumb request that was only used to know that we got the envelope */
 					_starpu_mpi_request_destroy(sync_req);
 				}
 				else
 				{
+					/* Case: no matching data has been received. Store the receive request as an early_request. */
 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
 					_starpu_mpi_early_request_enqueue(req);
 				}
@@ -684,6 +686,8 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
 
+	STARPU_VALGRIND_YIELD();
+
 #ifdef STARPU_SIMGRID
 	ret = req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, flag);
 	if (*flag)
@@ -908,6 +912,8 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 }
 
+/* This is called when the data is now received in the early data handle, we can
+ * now copy it over to the real handle. */
 static void _starpu_mpi_early_data_cb(void* arg)
 {
 	struct _starpu_mpi_early_data_cb_args *args = arg;
@@ -954,7 +960,7 @@ static void _starpu_mpi_early_data_cb(void* arg)
 	}
 
 	_STARPU_MPI_DEBUG(3, "Done, handling release of early_handle..\n");
-	starpu_data_release(args->early_handle);
+	starpu_data_release_on_node(args->early_handle, STARPU_MAIN_RAM);
 
 	_STARPU_MPI_DEBUG(3, "Done, handling unregister of early_handle..\n");
 	/* XXX: note that we have already freed the registered buffer above. In
@@ -1101,8 +1107,6 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 		_starpu_mpi_req_list_push_back(&detached_requests, req);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
-		starpu_wake_all_blocked_workers();
-
 		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
@@ -1204,14 +1208,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 	}
 
-	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
+	if (!_starpu_mpi_nobind && starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
 	{
 		char hostname[65];
 		gethostname(hostname, sizeof(hostname));
 		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 	}
 	_starpu_mpi_do_initialize(argc_argv);
-	if (_starpu_mpi_thread_cpuid >= 0)
+	if (!_starpu_mpi_nobind && _starpu_mpi_thread_cpuid >= 0)
 		/* In case MPI changed the binding */
 		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
 #else
@@ -1450,7 +1454,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 						else
 						{
 							early_request->count = envelope->size;
-							_STARPU_MPI_MALLOC(early_request->ptr, early_request->count);
+							early_request->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, early_request->count, 0);
 							starpu_memory_allocate(STARPU_MAIN_RAM, early_request->count, STARPU_MEMORY_OVERFLOW);
 
 							STARPU_MPI_ASSERT_MSG(early_request->ptr, "cannot allocate message of size %ld\n", early_request->count);

+ 7 - 4
mpi/src/nmad/starpu_mpi_nmad.c

@@ -245,6 +245,8 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
+	STARPU_VALGRIND_YIELD();
+
 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 
 	/* we must do a test_locked to avoid race condition :
@@ -344,7 +346,7 @@ void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_ev
 				// req->ptr is freed by starpu_data_unpack
 				starpu_data_unpack(req->data_handle, req->ptr, req->count);
 			else
-				free(req->ptr);
+				starpu_free_on_node_flags(STARPU_MAIN_RAM, (uintptr_t) req->ptr, req->count, 0);
 		}
 		else
 		{
@@ -451,7 +453,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 
 #ifndef STARPU_SIMGRID
-	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
+	if (!_starpu_mpi_nobind && starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
 	{
 		char hostname[65];
 		gethostname(hostname, sizeof(hostname));
@@ -623,7 +625,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 	 * required for piom_ltask_set_bound_thread_indexes() */
 	_starpu_mpi_do_initialize(argc_argv);
 
-	if (_starpu_mpi_thread_cpuid < 0)
+	if (!_starpu_mpi_nobind && _starpu_mpi_thread_cpuid < 0)
 	{
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 	}
@@ -633,7 +635,8 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 	/* Tell pioman to use a bound thread for communication progression:
 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
 	int indexes[1] = { _starpu_mpi_thread_cpuid };
-	piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
+	if (!_starpu_mpi_nobind)
+		piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
 
 	/* Register some hooks for communication progress if needed */
 	int polling_point_prog, polling_point_idle;

+ 1 - 1
mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c

@@ -130,7 +130,7 @@ static void _starpu_mpi_unknown_datatype_recv_callback(nm_sr_event_t event, cons
 		int ret = nm_sr_recv_peek(req->backend->session, &(req->backend->data_request), &(req->backend->unknown_datatype_size));
 		STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "nm_sr_recv_peek returned %d", ret);
 
-		req->ptr = malloc(req->count);
+		req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
 		STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld", req->count);
 
 		nm_mpi_nmad_data_get(&(req->backend->unknown_datatype_body), (void*) req->ptr, req->datatype, req->count);

+ 2 - 2
mpi/src/starpu_mpi_coop_sends.c

@@ -47,13 +47,13 @@ void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req)
 			/* We were last, release data */
 			free(coop_sends->reqs_array);
 			free(coop_sends);
-			starpu_data_release(req->data_handle);
+			starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
 		}
 	}
 	else
 	{
 		/* Trivial request */
-		starpu_data_release(req->data_handle);
+		starpu_data_release_on_node(req->data_handle, STARPU_MAIN_RAM);
 	}
 }
 

+ 1 - 1
mpi/src/starpu_mpi_fxt.h

@@ -72,7 +72,7 @@ extern "C"
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, data_tag, size)	\
 	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, data_tag, size, jobid, handle)	\
-	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), _starpu_gettid(), (handle));
+	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), (handle), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, data_tag)	\
 	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, data_tag)	\

+ 2 - 0
mpi/src/starpu_mpi_private.c

@@ -22,6 +22,7 @@ int _starpu_debug_level_max=0;
 int _starpu_mpi_tag = 42;
 int _starpu_mpi_comm_debug;
 
+int _starpu_mpi_nobind = -1;
 int _starpu_mpi_thread_cpuid = -1;
 int _starpu_mpi_use_prio = 1;
 int _starpu_mpi_fake_world_size = -1;
@@ -62,6 +63,7 @@ void _starpu_mpi_env_init(void)
         _starpu_mpi_comm_debug = starpu_getenv("STARPU_MPI_COMM") != NULL;
 	_starpu_mpi_fake_world_size = starpu_get_env_number("STARPU_MPI_FAKE_SIZE");
 	_starpu_mpi_fake_world_rank = starpu_get_env_number("STARPU_MPI_FAKE_RANK");
+	_starpu_mpi_nobind = starpu_get_env_number_default("STARPU_MPI_NOBIND", 0);
 	_starpu_mpi_thread_cpuid = starpu_get_env_number_default("STARPU_MPI_THREAD_CPUID", -1);
 	_starpu_mpi_use_prio = starpu_get_env_number_default("STARPU_MPI_PRIORITIES", 1);
 	_starpu_mpi_use_coop_sends = starpu_get_env_number_default("STARPU_MPI_COOP_SENDS", 1);

+ 2 - 1
mpi/src/starpu_mpi_private.h

@@ -61,6 +61,7 @@ void _starpu_mpi_set_debug_level_max(int level);
 extern int _starpu_mpi_fake_world_size;
 extern int _starpu_mpi_fake_world_rank;
 extern int _starpu_mpi_use_prio;
+extern int _starpu_mpi_nobind;
 extern int _starpu_mpi_thread_cpuid;
 extern int _starpu_mpi_use_coop_sends;
 extern int _starpu_mpi_mem_throttle;
@@ -200,7 +201,7 @@ struct _starpu_mpi_data
 {
 	int magic;
 	struct _starpu_mpi_node_tag node_tag;
-	int *cache_sent;
+	char *cache_sent;
 	int cache_received;
 
 	/** Rendez-vous data for opportunistic cooperative sends */

+ 2 - 2
mpi/tests/Makefile.am

@@ -45,10 +45,10 @@ endif
 endif
 
 if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 else
-TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 
 if STARPU_MPI_CHECK

+ 1 - 1
mpi/tests/broadcast.c

@@ -40,7 +40,7 @@ int main(int argc, char **argv)
 {
 	int ret, rank, size;
 	starpu_data_handle_t handle;
-	int var;
+	int var=-1;
 	int mpi_init;
 	MPI_Status status;
 

+ 18 - 2
mpi/tests/early_request.c

@@ -109,6 +109,18 @@ void submitted_order_fun(void *buffers[], void *cl_arg)
 	(void)cl_arg;
 }
 
+static struct starpu_codelet submitted_order_rw =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {submitted_order_fun, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+#ifdef STARPU_SIMGRID
+	.model = &starpu_perfmodel_nop,
+#endif
+	.name = "submitted_order_enforcer"
+};
+
 static struct starpu_codelet submitted_order =
 {
 	.where = STARPU_CPU,
@@ -156,11 +168,15 @@ void insert_work_for_one_element(struct element *el)
 			   STARPU_W,tmp_send,
 			   0);
 	//Send operation
-	starpu_insert_task(&submitted_order,
+	starpu_insert_task(&submitted_order_rw,
 			   STARPU_RW,el->ensure_submitted_order_send,
-			   STARPU_W,tmp_send,
+			   STARPU_RW,tmp_send,
 			   0);
 	starpu_mpi_isend_detached(tmp_send,el->foreign_domain,el->tag, MPI_COMM_WORLD, NULL, NULL);
+	starpu_insert_task(&submitted_order_rw,
+			   STARPU_RW,el->ensure_submitted_order_send,
+			   STARPU_RW,tmp_send,
+			   0);
 
 	//Recv operation for current element
 	starpu_insert_task(&submitted_order,

+ 1 - 1
mpi/tests/insert_task_compute.c

@@ -47,7 +47,7 @@ int test(int rank, int node, int *before, int *after, int task_insert, int data_
 	ret = starpu_mpi_init_conf(NULL, NULL, 0, MPI_COMM_WORLD, NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 
-	if (starpu_cpu_worker_get_count() <= 0)
+	if (starpu_cpu_worker_get_count() == 0)
 	{
 		// If there is no cpu to execute the codelet, mpi will block trying to do the post-execution communication
 		ret = -ENODEV;

+ 5 - 3
mpi/tests/pingpong.c

@@ -43,13 +43,14 @@ int main(int argc, char **argv)
 {
 	int ret, rank, size;
 	int mpi_init;
+	int i;
 
 	int niter = DEFAULT_NITER;
 	int data_size = DEFAULT_DATA_SIZE;
 	int sleep_time = DEFAULT_SLEEP_TIME;
 	int method = DEFAULT_METHOD;
 
-	for (int i = 1; i < argc; i++)
+	for (i = 1; i < argc; i++)
 	{
 		if (strcmp(argv[i], "-n") == 0)
 		{
@@ -134,6 +135,7 @@ int main(int argc, char **argv)
 	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int sender;
+	int r = 0;
 
 	if (method == 0) // ping pongs
 	{
@@ -161,7 +163,7 @@ int main(int argc, char **argv)
 			sender = loop % size;
 			if (sender == rank)
 			{
-				for (int r = 0; r < size; r++)
+				for (r = 0; r < size; r++)
 				{
 					if (r != rank)
 					{
@@ -175,7 +177,7 @@ int main(int argc, char **argv)
 				MPI_Status status;
 				starpu_mpi_recv(tab_handle, sender, (rank * niter) + loop, MPI_COMM_WORLD, &status);
 
-				for (int r = 0; r < (size-1); r++)
+				for (r = 0; r < (size-1); r++)
 					starpu_sleep(sleep_time / 1000);
 			}
 		}

+ 11 - 0
src/common/fxt.c

@@ -27,6 +27,7 @@ unsigned long _starpu_job_cnt = 0;
 #ifdef STARPU_USE_FXT
 #include <common/fxt.h>
 #include <starpu_fxt.h>
+#include <sys/stat.h>
 
 #ifdef STARPU_HAVE_WINDOWS
 #include <windows.h>
@@ -95,6 +96,16 @@ static void _starpu_profile_set_tracefile(void)
 	char *fxt_prefix = starpu_getenv("STARPU_FXT_PREFIX");
 	if (!fxt_prefix)
 	     fxt_prefix = "/tmp/";
+	else
+	{
+		// Check if the given folder really exists:
+		struct stat folder_stat;
+		if (stat(fxt_prefix, &folder_stat) < 0 || !S_ISDIR(folder_stat.st_mode))
+		{
+			_STARPU_MSG("%s is not a valid directory.\n", fxt_prefix);
+			_starpu_abort();
+		}
+	}
 
 	user = starpu_getenv("USER");
 	if (!user)

+ 2 - 0
src/common/utils.h

@@ -85,8 +85,10 @@
 #define _STARPU_UYIELD() ((void)0)
 #endif
 #if defined(STARPU_HAVE_SCHED_YIELD) && defined(STARPU_HAVE_HELGRIND_H)
+#define STARPU_VALGRIND_YIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); } while (0)
 #define STARPU_UYIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); else _STARPU_UYIELD(); } while (0)
 #else
+#define STARPU_VALGRIND_YIELD() do { } while (0)
 #define STARPU_UYIELD() _STARPU_UYIELD()
 #endif
 

+ 4 - 16
src/core/dependencies/cg.c

@@ -169,7 +169,6 @@ int _starpu_list_tag_successors_in_cg_list(struct _starpu_cg_list *successors, u
 	return n;
 }
 
-/* Note: in case of a tag, it must be already locked */
 void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg)
 {
 	STARPU_ASSERT(cg);
@@ -208,6 +207,7 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 				struct _starpu_tag *tag;
 
 				tag = cg->succ.tag;
+				_starpu_spin_lock(&tag->lock);
 				tag_successors = &tag->tag_successors;
 
 				tag_successors->ndeps_completed++;
@@ -219,8 +219,10 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 				{
 					/* reset the counter so that we can reuse the completion group */
 					tag_successors->ndeps_completed = 0;
+					/* This releases the lock */
 					_starpu_tag_set_ready(tag);
-				}
+				} else
+					_starpu_spin_unlock(&tag->lock);
 				break;
 			}
 
@@ -370,21 +372,7 @@ void _starpu_notify_cg_list(void *pred, struct _starpu_cg_list *successors)
 			successors->nsuccs--;
 		}
 		_starpu_spin_unlock(&successors->lock);
-
-		struct _starpu_tag *cgtag = NULL;
-
-		if (cg_type == STARPU_CG_TAG)
-		{
-			cgtag = cg->succ.tag;
-			STARPU_ASSERT(cgtag);
-			_starpu_spin_lock(&cgtag->lock);
-		}
-
 		_starpu_notify_cg(pred, cg);
-
-		if (cg_type == STARPU_CG_TAG)
-			_starpu_spin_unlock(&cgtag->lock);
-
 		_starpu_spin_lock(&successors->lock);
 	}
 	successors->terminated = 1;

+ 31 - 9
src/core/dependencies/data_arbiter_concurrency.c

@@ -533,7 +533,7 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 }
 void ___starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 #else // LOCK_OR_DELEGATE
-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
 #endif
 {
 	starpu_arbiter_t arbiter = handle->arbiter;
@@ -546,10 +546,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 	{
 		/* No waiter, just remove our reference */
 		_starpu_spin_lock(&handle->header_lock);
-		STARPU_ASSERT(handle->refcnt > 0);
-		handle->refcnt--;
-		STARPU_ASSERT(handle->busy_count > 0);
-		handle->busy_count--;
+		if (down_to_mode == STARPU_NONE)
+		{
+			STARPU_ASSERT(handle->refcnt > 0);
+			handle->refcnt--;
+			STARPU_ASSERT(handle->busy_count > 0);
+			handle->busy_count--;
+		}
+		else
+		{
+			/* Downgrade from W or RW down to R, keeping the same reference,
+			 * but thus allowing other readers without allowing writers.  */
+			STARPU_ASSERT(down_to_mode == STARPU_R &&
+				      handle->current_mode == STARPU_W);
+			handle->current_mode = down_to_mode;
+		}
 #ifndef LOCK_OR_DELEGATE
 		STARPU_PTHREAD_MUTEX_UNLOCK(&arbiter->mutex);
 #endif
@@ -562,10 +573,21 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 
 	/* There is a waiter, remove our reference */
 	_starpu_spin_lock(&handle->header_lock);
-	STARPU_ASSERT(handle->refcnt > 0);
-	handle->refcnt--;
-	STARPU_ASSERT(handle->busy_count > 0);
-	handle->busy_count--;
+	if (down_to_mode == STARPU_NONE)
+	{
+		STARPU_ASSERT(handle->refcnt > 0);
+		handle->refcnt--;
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+	}
+	else
+	{
+		/* Downgrade from W or RW down to R, keeping the same reference,
+		 * but thus allowing other readers without allowing writers.  */
+		STARPU_ASSERT(down_to_mode == STARPU_R &&
+			      handle->current_mode == STARPU_W);
+		handle->current_mode = down_to_mode;
+	}
 	/* There should be at least one busy_count reference for the waiter
 	 * (thus we don't risk to see the handle disappear below) */
 	STARPU_ASSERT(handle->busy_count > 0);

+ 29 - 11
src/core/dependencies/data_concurrency.c

@@ -509,10 +509,16 @@ void _starpu_submit_job_take_data_deps(struct _starpu_job *j)
  * This may free the handle if it was lazily unregistered (1 is returned in
  * that case). The handle pointer thus becomes invalid for the caller.
  */
-int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode)
 {
 	_starpu_spin_checklocked(&handle->header_lock);
 
+	if (down_to_mode != STARPU_NONE && handle->current_mode == down_to_mode)
+	{
+		/* No change, nothing to do */
+		return 0;
+	}
+
 	if (handle->arbiter)
 	{
 		/* Keep our reference for now, _starpu_notify_arbitered_dependencies
@@ -521,22 +527,34 @@ int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 		STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->reduction_req_list));
 		_starpu_spin_unlock(&handle->header_lock);
 		/* _starpu_notify_arbitered_dependencies will handle its own locking */
-		_starpu_notify_arbitered_dependencies(handle);
+		_starpu_notify_arbitered_dependencies(handle, down_to_mode);
 		/* We have already unlocked */
 		return 1;
 	}
 
-	/* A data access has finished so we remove a reference. */
-	STARPU_ASSERT(handle->refcnt > 0);
-	handle->refcnt--;
-	STARPU_ASSERT(handle->busy_count > 0);
-	handle->busy_count--;
-	if (_starpu_data_check_not_busy(handle))
-		/* Handle was destroyed, nothing left to do.  */
-		return 1;
-
 	STARPU_ASSERT(_starpu_data_requester_prio_list_empty(&handle->arbitered_req_list));
 
+	if (down_to_mode == STARPU_NONE)
+	{
+		/* A data access has finished so we remove a reference. */
+		STARPU_ASSERT(handle->refcnt > 0);
+		handle->refcnt--;
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+		if (_starpu_data_check_not_busy(handle))
+			/* Handle was destroyed, nothing left to do.  */
+			return 1;
+	}
+	else
+	{
+		/* Downgrade from W or RW down to R, keeping the same reference,
+		 * but thus allowing other readers without allowing writers.  */
+		STARPU_ASSERT(down_to_mode == STARPU_R &&
+				(handle->current_mode == STARPU_RW ||
+				 handle->current_mode == STARPU_W));
+		handle->current_mode = down_to_mode;
+	}
+
 	/* In case there is a pending reduction, and that this is the last
 	 * requester, we may go back to a "normal" coherency model. */
 	if (handle->reduction_refcnt > 0)

+ 2 - 2
src/core/dependencies/data_concurrency.h

@@ -28,8 +28,8 @@ void _starpu_submit_job_enforce_arbitered_deps(struct _starpu_job *j, unsigned b
 void _starpu_submit_job_take_data_deps(struct _starpu_job *j);
 void _starpu_enforce_data_deps_notify_job_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
 
-int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
-void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle);
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
+void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle, enum starpu_data_access_mode down_to_mode);
 
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
 							  enum starpu_data_access_mode mode,

+ 14 - 4
src/core/dependencies/implicit_data_deps.c

@@ -225,10 +225,13 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
 		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
 
+		if (mode & STARPU_R)
+			STARPU_ASSERT_MSG(handle->initialized || handle->init_cl, "Handle %p is not initialized, it cannot be read", handle);
+
 		if (mode & STARPU_W || mode == STARPU_REDUX)
 		{
 
-			STARPU_ASSERT_MSG(!handle->readonly, "Read-only handles can not be written to");
+			STARPU_ASSERT_MSG(!handle->readonly, "Read-only handle %p can not be written to", handle);
 
 			handle->initialized = 1;
 			/* We will change our value, disconnect from our readonly duplicates */
@@ -613,18 +616,25 @@ void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data
         _STARPU_LOG_OUT();
 }
 
-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	unsigned do_submit_tasks = 0;
+	unsigned last_cnt;
 
 	/* Here helgrind would shout that this is an unprotected access, but
 	 * count can only be zero if we don't have to care about
 	 * post_sync_tasks_cnt at all.  */
-	if (STARPU_RUNNING_ON_VALGRIND || handle->post_sync_tasks_cnt)
+	if (handle->post_sync_tasks_cnt)
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
-		if (--handle->post_sync_tasks_cnt == 0)
+		last_cnt = handle->post_sync_tasks_cnt;
+
+		if (mode == STARPU_NONE)
+			/* Last release from us */
+			handle->post_sync_tasks_cnt--;
+
+		if (last_cnt == 1)
 		{
 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
 			do_submit_tasks = 1;

+ 1 - 1
src/core/dependencies/implicit_data_deps.h

@@ -30,7 +30,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
-void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);
+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 
 /** Register a hook to be called when a write is submitted */
 void _starpu_implicit_data_deps_write_hook(void (*func)(starpu_data_handle_t));

+ 16 - 11
src/core/dependencies/tags.c

@@ -120,13 +120,23 @@ static void _starpu_tag_free(void *_tag)
 			unsigned STARPU_ATTRIBUTE_UNUSED remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);
 
 			if (!ntags && (cg->cg_type == STARPU_CG_TAG))
+			{
 				/* Last tag this cg depends on, cg becomes unreferenced */
+#ifdef STARPU_DEBUG
+				free(cg->deps);
+				free(cg->done);
+#endif
 				free(cg);
+			}
 		}
 
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 		free(tag->tag_successors.succ);
 #endif
+#ifdef STARPU_DEBUG
+		free(tag->tag_successors.deps);
+		free(tag->tag_successors.done);
+#endif
 
 		_starpu_spin_unlock(&tag->lock);
 		_starpu_spin_destroy(&tag->lock);
@@ -221,7 +231,7 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 	return tag;
 }
 
-/* lock should be taken */
+/* lock should be taken, and this releases it */
 void _starpu_tag_set_ready(struct _starpu_tag *tag)
 {
 	/* mark this tag as ready to run */
@@ -229,6 +239,10 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 	/* declare it to the scheduler ! */
 	struct _starpu_job *j = tag->job;
 
+	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
+	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
+	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
+
 	/* In case the task job is going to be scheduled immediately, and if
 	 * the task is "empty", calling _starpu_push_task would directly try to enforce
 	 * the dependencies of the task, and therefore it would try to grab the
@@ -238,14 +252,9 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 	/* enforce data dependencies */
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	_starpu_enforce_deps_starting_from_task(j);
-
-	_starpu_spin_lock(&tag->lock);
-	STARPU_ASSERT(!STARPU_AYU_EVENT || tag->id < STARPU_AYUDAME_OFFSET);
-	STARPU_AYU_PRERUNTASK(tag->id + STARPU_AYUDAME_OFFSET, -1);
-	STARPU_AYU_POSTRUNTASK(tag->id + STARPU_AYUDAME_OFFSET);
 }
 
-/* the lock must be taken ! */
+/* the lock of the tag must already be taken ! */
 static void _starpu_tag_add_succ(struct _starpu_tag *tag, struct _starpu_cg *cg)
 {
 	STARPU_ASSERT(tag);
@@ -396,12 +405,10 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
-		_starpu_spin_lock(&tag_child->lock);
 		_starpu_tag_add_succ(tag_dep, cg);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
-		_starpu_spin_unlock(&tag_child->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 	}
 }
@@ -434,12 +441,10 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
-		_starpu_spin_lock(&tag_child->lock);
 		_starpu_tag_add_succ(tag_dep, cg);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || dep_id < STARPU_AYUDAME_OFFSET);
 		STARPU_ASSERT(!STARPU_AYU_EVENT || id < STARPU_AYUDAME_OFFSET);
 		STARPU_AYU_ADDDEPENDENCY(dep_id+STARPU_AYUDAME_OFFSET, 0, id+STARPU_AYUDAME_OFFSET);
-		_starpu_spin_unlock(&tag_child->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 	}
 	va_end(pa);

+ 2 - 0
src/core/dependencies/tags.h

@@ -68,6 +68,8 @@ void _starpu_notify_tag_dependencies(struct _starpu_tag *tag);
 void _starpu_notify_job_start_tag_dependencies(struct _starpu_tag *tag, _starpu_notify_job_start_data *data);
 
 void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job);
+
+/* lock should be taken, and this releases it */
 void _starpu_tag_set_ready(struct _starpu_tag *tag);
 
 unsigned _starpu_submit_job_enforce_task_deps(struct _starpu_job *j);

+ 1 - 1
src/core/jobs.c

@@ -82,7 +82,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
 	job->task = task;
 
-#ifndef STARPU_USE_FXT
+#if !defined(STARPU_USE_FXT) && !defined(STARPU_DEBUG)
 	if (_starpu_bound_recording || _starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1 || STARPU_AYU_EVENT)
 #endif
 	{

+ 3 - 0
src/core/perfmodel/multiple_regression.c

@@ -236,6 +236,9 @@ int dgels_multiple_reg_coeff(double *mpar, double *my, unsigned long nn, unsigne
 	if( info != 0 )
 	{
 		_STARPU_DISP("Warning: Problems when executing dgels_ function. It seems like the diagonal element %ld is zero.\n Multiple linear regression model will not be written into perfmodel file.\n", info);
+		free(X);
+		free(Y);
+		free(work);
 		return 1;
 	}
 

+ 13 - 10
src/datawizard/coherency.c

@@ -862,7 +862,7 @@ uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 
 /* in case the data was accessed on a write mode, do not forget to
  * make it accessible again once it is possible ! */
-void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, struct _starpu_data_replicate *replicate)
+void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
 {
 	uint32_t wt_mask;
 	wt_mask = default_wt_mask | handle->wt_mask;
@@ -887,14 +887,17 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 
-	/* Release refcnt taken by fetch_data_on_node */
-	replicate->refcnt--;
-	STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
+	if (down_to_mode == STARPU_NONE)
+	{
+		/* Release refcnt taken by fetch_data_on_node */
+		replicate->refcnt--;
+		STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
 
-	STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
-	handle->busy_count--;
+		STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
+		handle->busy_count--;
+	}
 
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, down_to_mode))
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
@@ -1229,7 +1232,7 @@ enomem:
 
 		local_replicate = get_replicate(handle, mode, workerid, node);
 
-		_starpu_release_data_on_node(handle, 0, local_replicate);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
 	}
 
 	return -1;
@@ -1338,13 +1341,13 @@ void __starpu_push_task_output(struct _starpu_job *j)
 		if (node == -1)
 		{
 			/* NOWHERE case, just notify dependencies */
-			if (!_starpu_notify_data_dependencies(handle))
+			if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 				_starpu_spin_unlock(&handle->header_lock);
 		}
 		else
 		{
 			_starpu_spin_unlock(&handle->header_lock);
-			_starpu_release_data_on_node(handle, 0, local_replicate);
+			_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
 		}
 	}
 

+ 1 - 0
src/datawizard/coherency.h

@@ -316,6 +316,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
+				  enum starpu_data_access_mode down_to_mode,
 				  struct _starpu_data_replicate *replicate);
 
 void _starpu_update_data_state(starpu_data_handle_t handle,

+ 4 - 4
src/datawizard/copy_driver.c

@@ -375,8 +375,8 @@ int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
 	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) numblocks_1 * ld1_src, (unsigned long) ld2_src);
 	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) numblocks_1 * ld1_dst, (unsigned long) ld2_dst);
 
-	if (ld1_src * ld2_src == blocksize * numblocks_1 &&
-	    ld1_dst * ld2_dst == blocksize * numblocks_1)
+	if (ld2_src == blocksize * numblocks_1 &&
+	    ld2_dst == blocksize * numblocks_1)
 		/* Optimize contiguous case */
 		return starpu_interface_copy(src, src_offset, src_node,
 					     dst, dst_offset, dst_node,
@@ -425,8 +425,8 @@ int starpu_interface_copy4d(uintptr_t src, size_t src_offset, unsigned src_node,
 	STARPU_ASSERT_MSG(ld3_src >= numblocks_2 * ld2_src, "block group group size %lu is bigger than group group ld %lu in source", (unsigned long) numblocks_2 * ld2_src, (unsigned long) ld3_src);
 	STARPU_ASSERT_MSG(ld3_dst >= numblocks_2 * ld2_dst, "block group group size %lu is bigger than group group ld %lu in destination", (unsigned long) numblocks_2 * ld2_dst, (unsigned long) ld3_dst);
 
-	if (ld1_src * ld2_src * ld3_src == blocksize * numblocks_1 * numblocks_2 &&
-	    ld1_dst * ld2_dst * ld3_dst == blocksize * numblocks_1 * numblocks_2)
+	if (ld3_src == blocksize * numblocks_1 * numblocks_2 &&
+	    ld3_dst == blocksize * numblocks_1 * numblocks_2)
 		/* Optimize contiguous case */
 		return starpu_interface_copy(src, src_offset, src_node,
 					     dst, dst_offset, dst_node,

+ 6 - 2
src/datawizard/interfaces/data_interface.c

@@ -749,12 +749,12 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	{
 		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, handle->home_node, replicate);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate);
 	}
 	else
 	{
 		_starpu_spin_lock(&handle->header_lock);
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 }
@@ -836,6 +836,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	int sequential_consistency = handle->sequential_consistency;
 	if (sequential_consistency && !nowait)
 	{
+		/* We will acquire it in write mode to catch all dependencies,
+		 * but possibly it's not actually initialized. Fake it to avoid
+		 getting caught doing it */
+		handle->initialized = 1;
 		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_data_unregister must not be called from a task or callback, perhaps you can use starpu_data_unregister_submit instead");
 
 		/* If sequential consistency is enabled, wait until data is available */

+ 31 - 8
src/datawizard/user_interactions.c

@@ -485,16 +485,26 @@ int starpu_data_acquire_try(starpu_data_handle_t handle, enum starpu_data_access
 
 /* This function must be called after starpu_data_acquire so that the
  * application release the data */
-void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
+void starpu_data_release_to_on_node(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int node)
 {
 	STARPU_ASSERT(handle);
 
+	if (mode == STARPU_RW)
+		/* They are equivalent here, and current_mode is never STARPU_RW */
+		mode = STARPU_W;
+
+	STARPU_ASSERT_MSG(mode == STARPU_NONE ||
+			  mode == handle->current_mode ||
+			  (mode == STARPU_R &&
+			     handle->current_mode == STARPU_W),
+		"We only support releasing from W to R");
+
 	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
-	_starpu_unlock_post_sync_tasks(handle);
+	_starpu_unlock_post_sync_tasks(handle, mode);
 
 	/* The application can now release the rw-lock */
 	if (node >= 0)
-		_starpu_release_data_on_node(handle, 0, &handle->per_node[node]);
+		_starpu_release_data_on_node(handle, 0, mode, &handle->per_node[node]);
 	else
 	{
 		_starpu_spin_lock(&handle->header_lock);
@@ -505,17 +515,27 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 				handle->per_node[i].refcnt--;
 		}
 		handle->busy_count--;
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, mode))
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 }
 
-void starpu_data_release(starpu_data_handle_t handle)
+void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
+{
+	starpu_data_release_to_on_node(handle, STARPU_NONE, node);
+}
+
+void starpu_data_release_to(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 	int home_node = handle->home_node;
 	if (home_node < 0)
 		home_node = STARPU_MAIN_RAM;
-	starpu_data_release_on_node(handle, home_node);
+	starpu_data_release_to_on_node(handle, mode, home_node);
+}
+
+void starpu_data_release(starpu_data_handle_t handle)
+{
+	starpu_data_release_to(handle, STARPU_NONE);
 }
 
 static void _prefetch_data_on_node(void *arg)
@@ -531,7 +551,7 @@ static void _prefetch_data_on_node(void *arg)
 		_starpu_data_acquire_wrapper_finished(wrapper);
 
 	_starpu_spin_lock(&handle->header_lock);
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
@@ -581,7 +601,7 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 		/* In case there was a temporary handle (eg. used for reduction), this
 		 * handle may have requested to be destroyed when the data is released
 		 * */
-		if (!_starpu_notify_data_dependencies(handle))
+		if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 	else if (!async)
@@ -666,6 +686,9 @@ static void _starpu_data_wont_use(void *data)
 
 void starpu_data_wont_use(starpu_data_handle_t handle)
 {
+	if (!handle->initialized)
+		/* No value atm actually */
+		return;
 	_STARPU_TRACE_DATA_WONT_USE(handle);
 	starpu_data_acquire_on_node_cb_sequential_consistency_quick(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_R, _starpu_data_wont_use, handle, 1, 1);
 }

+ 1 - 1
src/datawizard/write_back.c

@@ -24,7 +24,7 @@ static void wt_callback(void *arg)
 	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
 
 	_starpu_spin_lock(&handle->header_lock);
-	if (!_starpu_notify_data_dependencies(handle))
+	if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
 		_starpu_spin_unlock(&handle->header_lock);
 }
 

+ 2 - 2
src/debug/latency.c

@@ -36,7 +36,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
 		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, node0, replicate_0);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_0);
 
 		_starpu_spin_lock(&handle->header_lock);
 		handle->refcnt++;
@@ -46,6 +46,6 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
 		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
-		_starpu_release_data_on_node(handle, node1, replicate_1);
+		_starpu_release_data_on_node(handle, 0, STARPU_NONE, replicate_1);
 	}
 }

+ 19 - 13
src/debug/traces/starpu_fxt.c

@@ -333,7 +333,7 @@ static struct data_info *get_data(unsigned long handle, int mpi_rank)
 	return data;
 }
 
-static void handle_papi_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+static void handle_papi_event(struct fxt_ev_64 *ev STARPU_ATTRIBUTE_UNUSED, struct starpu_fxt_options *options STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_PAPI
 	int event_code = ev->param[0];
@@ -2307,9 +2307,11 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 			snprintf(paje_key, sizeof(paje_key), "com_%u", comid);
 			program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
 			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, src);
-			poti_StartLink(time, program_container, link_type, src_memnode_container, paje_value, paje_key);
+			char str_handle[STARPU_POTI_STR_LEN];
+			snprintf(str_handle, sizeof(str_handle), "%lx", handle);
+			poti_user_StartLink(_starpu_poti_CommLinkStart, time, program_container, link_type, src_memnode_container, paje_value, paje_key, 1, str_handle);
 #else
-			fprintf(out_paje_file, "18	%.9f	%s	%sp	%u	%smm%u	com_%u\n", time, link_type, prefix, size, prefix, src, comid);
+			fprintf(out_paje_file, "24	%.9f	%s	%sp	%u	%smm%u	com_%u	%lx\n", time, link_type, prefix, size, prefix, src, comid, handle);
 #endif
 		}
 
@@ -2636,10 +2638,12 @@ static void handle_job_push(struct fxt_ev_64 *ev, struct starpu_fxt_options *opt
                char paje_value[STARPU_POTI_STR_LEN];
                snprintf(paje_value, sizeof(paje_value), "%u", task);
                snprintf(container, sizeof(container), "%sp", options->file_prefix);
-               poti_NewEvent(get_event_time_stamp(ev, options), container, "pu", paje_value);
+		if (!options->no_events)
+			poti_NewEvent(get_event_time_stamp(ev, options), container, "pu", paje_value);
 #else
-	       fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
-               fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "pu", options->file_prefix, task);
+		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
+		if (!options->no_events)
+			fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "pu", options->file_prefix, task);
 #endif
 	}
 
@@ -2652,9 +2656,9 @@ static void handle_job_push(struct fxt_ev_64 *ev, struct starpu_fxt_options *opt
 		fprintf(sched_tasks_file, "Time: %.9f\n", current_timestamp);
 		fprintf(sched_tasks_file, "Priority: %d\n", priority);
 		if (options->file_rank < 0)
-			fprintf(sched_tasks_file, "JobId: %d\n", task);
+			fprintf(sched_tasks_file, "JobId: %u\n", task);
 		else
-			fprintf(sched_tasks_file, "JobId: %d_%d\n", options->file_rank, task);
+			fprintf(sched_tasks_file, "JobId: %d_%u\n", options->file_rank, task);
 		fprintf(sched_tasks_file, "\n");
 	}
 }
@@ -2681,11 +2685,13 @@ static void handle_job_pop(struct fxt_ev_64 *ev, struct starpu_fxt_options *opti
 		char paje_value[STARPU_POTI_STR_LEN];
 		snprintf(paje_value, sizeof(paje_value), "%u", task);
 		snprintf(container, sizeof(container), "%sp", options->file_prefix);
-		poti_NewEvent(get_event_time_stamp(ev, options), container, "po", paje_value);
+		if (!options->no_events)
+			poti_NewEvent(get_event_time_stamp(ev, options), container, "po", paje_value);
 #else
 		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
 		fprintf(out_paje_file, "13	%.9f	%ssched	nsubmitted	%f\n", current_timestamp, options->file_prefix, (float)nsubmitted);
-		fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "po", options->file_prefix, task);
+		if (!options->no_events)
+			fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "po", options->file_prefix, task);
 #endif
 	}
 
@@ -2701,9 +2707,9 @@ static void handle_job_pop(struct fxt_ev_64 *ev, struct starpu_fxt_options *opti
 		fprintf(sched_tasks_file, "Time: %.9f\n", current_timestamp);
 		fprintf(sched_tasks_file, "Priority: %d\n", priority);
 		if (options->file_rank < 0)
-			fprintf(sched_tasks_file, "JobId: %d\n", task);
+			fprintf(sched_tasks_file, "JobId: %u\n", task);
 		else
-			fprintf(sched_tasks_file, "JobId: %d_%d\n", options->file_rank, task);
+			fprintf(sched_tasks_file, "JobId: %d_%u\n", options->file_rank, task);
 		fprintf(sched_tasks_file, "\n");
 	}
 }
@@ -4450,7 +4456,7 @@ void _starpu_fxt_number_events_file_close(void)
 		for (i = 0; i <= FUT_SETUP_CODE; i++)
 		{
 			if (number_events[i] > 0)
-				fprintf(number_events_file, "0x%x\t%lu\n", i, number_events[i]);
+				fprintf(number_events_file, "0x%x\t%"PRIu64"\n", i, number_events[i]);
 		}
 
 		free(number_events);

+ 1 - 0
src/debug/traces/starpu_fxt.h

@@ -68,6 +68,7 @@ void _starpu_fxt_write_paje_header(FILE *file, struct starpu_fxt_options *option
 extern int _starpu_poti_extendedSetState;
 extern int _starpu_poti_semiExtendedSetState;
 extern int _starpu_poti_MemoryEvent;
+extern int _starpu_poti_CommLinkStart;
 extern int _starpu_poti_MpiLinkStart;
 
 /*

+ 8 - 6
src/debug/traces/starpu_fxt_mpi.c

@@ -331,7 +331,9 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
 			char str_mpi_tag[STARPU_POTI_STR_LEN];
 			snprintf(str_mpi_tag, sizeof(str_mpi_tag), "%ld", mpi_tag);
-			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 1, str_mpi_tag);
+			char str_handle[STARPU_POTI_STR_LEN];
+			snprintf(str_handle, sizeof(str_handle), "%lx", send_handle);
+			poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 2, str_mpi_tag, str_handle);
 
 			poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_out_bandwidth[src]);
 			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
@@ -340,7 +342,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 #else
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo_mpi	%f\n", start_date, src, current_out_bandwidth[src]);
 			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi_mpi	%f\n", start_date, dst, current_in_bandwidth[dst]);
-			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld\n", start_date, (unsigned long)size, src, id, mpi_tag);
+			fprintf(out_paje_file, "23	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu	%ld	%lx\n", start_date, (unsigned long)size, src, id, mpi_tag, send_handle);
 			fprintf(out_paje_file, "19	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
 #endif
 
@@ -354,10 +356,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 				fprintf(out_comms_file, "SendHandle: %lx\n", send_handle);
 				fprintf(out_comms_file, "RecvHandle: %lx\n", recv_handle);
 				if (cur->jobid != -1)
-					fprintf(out_comms_file, "SendJobId: %d_%lu\n", src, cur->jobid);
+					fprintf(out_comms_file, "SendJobId: %d_%ld\n", src, cur->jobid);
 				if (match->jobid != -1)
-					fprintf(out_comms_file, "RecvJobId: %d_%lu\n", dst, match->jobid);
-				fprintf(out_comms_file, "Size: %ld\n", size);
+					fprintf(out_comms_file, "RecvJobId: %d_%ld\n", dst, match->jobid);
+				fprintf(out_comms_file, "Size: %lu\n", (unsigned long)size);
 				fprintf(out_comms_file, "\n");
 			}
 		}
@@ -372,7 +374,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 	if (nb_wrong_comm_timing == 1)
 		_STARPU_MSG("Warning: a communication finished before it started !\n");
 	else if (nb_wrong_comm_timing > 1)
-		_STARPU_MSG("Warning: %d communications finished before they started !\n", nb_wrong_comm_timing);
+		_STARPU_MSG("Warning: %u communications finished before they started !\n", nb_wrong_comm_timing);
 }
 
 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)

+ 7 - 4
src/debug/traces/starpu_paje.c

@@ -28,6 +28,7 @@
 int _starpu_poti_extendedSetState = -1;
 int _starpu_poti_semiExtendedSetState = -1;
 int _starpu_poti_MemoryEvent = -1;
+int _starpu_poti_CommLinkStart = -1;
 int _starpu_poti_MpiLinkStart = -1;
 #endif
 #endif
@@ -62,6 +63,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 						     "SubmitOrder string"
 						     );
 #ifdef HAVE_POTI_USER_NEWEVENT
+	_starpu_poti_CommLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "Handle string");
 	if (options->memory_states)
 	{
 		_starpu_poti_MemoryEvent = poti_header_DeclareEvent (PAJE_NewEvent,
@@ -71,7 +73,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 							     "Size string",
 							     "Dest string");
 	}
-	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 1, "MPITAG string");
+	_starpu_poti_MpiLinkStart = poti_header_DeclareEvent(PAJE_StartLink, 2, "MPITAG string", "Handle string");
 #endif
 #else
 	poti_header(1,1);
@@ -230,15 +232,16 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 	fprintf(file, "%%	StartContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%	MPITAG	string\n");
+	fprintf(file, "%%	Handle	string\n");
 	fprintf(file, "%%EndEventDef\n");
-	fprintf(file, "%%EventDef	PajeEndLink	24\n");
+	fprintf(file, "%%EventDef	PajeStartLink	24\n");
 	fprintf(file, "%%	Time	date\n");
 	fprintf(file, "%%	Type	string\n");
 	fprintf(file, "%%	Container	string\n");
 	fprintf(file, "%%	Value	string\n");
-	fprintf(file, "%%	EndContainer	string\n");
+	fprintf(file, "%%	StartContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
-	fprintf(file, "%%	MPITAG	string\n");
+	fprintf(file, "%%	Handle	string\n");
 	fprintf(file, "%%EndEventDef\n");
 #endif
 

+ 30 - 25
src/sched_policies/component_work_stealing.c

@@ -30,14 +30,20 @@
 #warning TODO: locality work-stealing
 #endif
 
+struct _starpu_component_work_stealing_data_per_worker
+{
+	struct _starpu_prio_deque fifo;
+	unsigned last_pop_child;
+};
+
 struct _starpu_component_work_stealing_data
 {
 /* keep track of the work performed from the beginning of the algorithm to make
  * better decisions about which queue to child when stealing or deferring work
  */
-	unsigned performed_total, last_pop_child, last_push_child;
+	struct _starpu_component_work_stealing_data_per_worker *per_worker;
+	unsigned performed_total, last_push_child;
 
-	struct _starpu_prio_deque * fifos;
 	starpu_pthread_mutex_t ** mutexes;
 	unsigned size;
 };
@@ -50,16 +56,14 @@ struct _starpu_component_work_stealing_data
 static struct starpu_task *  steal_task_round_robin(struct starpu_sched_component *component, int workerid)
 {
 	struct _starpu_component_work_stealing_data *wsd = component->data;
-	STARPU_HG_DISABLE_CHECKING(wsd->last_pop_child);
-	unsigned i = wsd->last_pop_child;
-	wsd->last_pop_child = (i + 1) % component->nchildren;
-	STARPU_HG_ENABLE_CHECKING(wsd->last_pop_child);
+	unsigned i = wsd->per_worker[workerid].last_pop_child;
+	wsd->per_worker[workerid].last_pop_child = (i + 1) % component->nchildren;
 	/* If the worker's queue have no suitable tasks, let's try
 	 * the next ones */
 	struct starpu_task * task = NULL;
 	while (1)
 	{
-		struct _starpu_prio_deque * fifo = &wsd->fifos[i];
+		struct _starpu_prio_deque * fifo = &wsd->per_worker[i].fifo;
 
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 		task = _starpu_prio_deque_deque_task_for_worker(fifo, workerid, NULL);
@@ -75,7 +79,7 @@ static struct starpu_task *  steal_task_round_robin(struct starpu_sched_componen
 			break;
 		}
 
-		if (i == wsd->last_pop_child)
+		if (i == wsd->per_worker[workerid].last_pop_child)
 		{
 			/* We got back to the first worker,
 			 * don't go in infinite loop */
@@ -141,17 +145,17 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 	struct _starpu_component_work_stealing_data * wsd = component->data;
 	const double now = starpu_timing_now();
 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->fifos[i]);
+	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->per_worker[i].fifo);
 	if(task)
 	{
 		if(!isnan(task->predicted))
 		{
-			wsd->fifos[i].exp_len -= task->predicted;
-			wsd->fifos[i].exp_start = now + task->predicted;
+			wsd->per_worker[i].fifo.exp_len -= task->predicted;
+			wsd->per_worker[i].fifo.exp_start = now + task->predicted;
 		}
 	}
 	else
-		wsd->fifos[i].exp_len = 0.0;
+		wsd->per_worker[i].fifo.exp_len = 0.0;
 
 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 	if(task)
@@ -163,7 +167,7 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 	if(task)
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		wsd->fifos[i].nprocessed++;
+		wsd->per_worker[i].fifo.nprocessed++;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 		return task;
@@ -196,9 +200,9 @@ double _ws_estimated_end(struct starpu_sched_component * component)
 	for(i = 0; i < component->nchildren; i++)
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		sum_len += wsd->fifos[i].exp_len;
-		wsd->fifos[i].exp_start = STARPU_MAX(now, wsd->fifos[i].exp_start);
-		sum_start += wsd->fifos[i].exp_start;
+		sum_len += wsd->per_worker[i].fifo.exp_len;
+		wsd->per_worker[i].fifo.exp_start = STARPU_MAX(now, wsd->per_worker[i].fifo.exp_start);
+		sum_start += wsd->per_worker[i].fifo.exp_start;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 	}
@@ -216,7 +220,7 @@ double _ws_estimated_load(struct starpu_sched_component * component)
 	for(i = 0; i < component->nchildren; i++)
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		ntasks += wsd->fifos[i].ntasks;
+		ntasks += wsd->per_worker[i].fifo.ntasks;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 	}
 	double speedup = 0.0;
@@ -265,7 +269,7 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 
 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 	starpu_sched_task_break(task);
-	ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i], task);
+	ret = _starpu_prio_deque_push_front_task(&wsd->per_worker[i].fifo, task);
 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 	wsd->last_push_child = i;
@@ -308,9 +312,9 @@ int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task)
 
 			struct _starpu_component_work_stealing_data * wsd = component->data;
 			STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-			int ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i] , task);
+			int ret = _starpu_prio_deque_push_front_task(&wsd->per_worker[i].fifo , task);
 			if(ret == 0 && !isnan(task->predicted))
-				wsd->fifos[i].exp_len += task->predicted;
+				wsd->per_worker[i].fifo.exp_len += task->predicted;
 			STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 			component->can_pull(component);
@@ -329,12 +333,13 @@ void _ws_add_child(struct starpu_sched_component * component, struct starpu_sche
 	if(wsd->size < component->nchildren)
 	{
 		STARPU_ASSERT(wsd->size == component->nchildren - 1);
-		_STARPU_REALLOC(wsd->fifos, component->nchildren * sizeof(*wsd->fifos));
+		_STARPU_REALLOC(wsd->per_worker, component->nchildren * sizeof(*wsd->per_worker));
 		_STARPU_REALLOC(wsd->mutexes, component->nchildren * sizeof(*wsd->mutexes));
 		wsd->size = component->nchildren;
 	}
 
-	_starpu_prio_deque_init(&wsd->fifos[component->nchildren - 1]);
+	wsd->per_worker[component->nchildren - 1].last_pop_child = 0;
+	_starpu_prio_deque_init(&wsd->per_worker[component->nchildren - 1].fifo);
 
 	starpu_pthread_mutex_t *mutex;
 	_STARPU_MALLOC(mutex, sizeof(*mutex));
@@ -356,8 +361,8 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 			break;
 	}
 	STARPU_ASSERT(i_component != component->nchildren);
-	struct _starpu_prio_deque tmp_fifo = wsd->fifos[i_component];
-	wsd->fifos[i_component] = wsd->fifos[component->nchildren - 1];
+	struct _starpu_prio_deque tmp_fifo = wsd->per_worker[i_component].fifo;
+	wsd->per_worker[i_component].fifo = wsd->per_worker[component->nchildren - 1].fifo;
 
 
 	component->children[i_component] = component->children[component->nchildren - 1];
@@ -372,7 +377,7 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 void _work_stealing_component_deinit_data(struct starpu_sched_component * component)
 {
 	struct _starpu_component_work_stealing_data * wsd = component->data;
-	free(wsd->fifos);
+	free(wsd->per_worker);
 	free(wsd->mutexes);
 	free(wsd);
 }

+ 68 - 22
src/sched_policies/work_stealing_policy.c

@@ -71,10 +71,21 @@ struct locality_entry
 
 struct _starpu_work_stealing_data_per_worker
 {
+	char fill1[STARPU_CACHELINE_SIZE];
+	/* This is read-mostly, only updated when the queue becomes empty or
+	 * becomes non-empty, to make it generally cheap to check */
+	unsigned notask;	/* whether the queue is empty */
+	char fill2[STARPU_CACHELINE_SIZE];
+
 	struct _starpu_prio_deque queue;
 	int running;
 	int *proxlist;
-	int busy;
+	int busy;	/* Whether this worker is working on a task */
+
+	/* keep track of the work performed from the beginning of the algorithm to make
+	 * better decisions about which queue to select when deferring work
+	 */
+	unsigned last_pop_worker;
 
 #ifdef USE_LOCALITY_TASKS
 	/* This records the same as queue, but hashed by data accessed with locality flag.  */
@@ -93,9 +104,8 @@ struct _starpu_work_stealing_data
 	int (*select_victim)(struct _starpu_work_stealing_data *, unsigned, int);
 	struct _starpu_work_stealing_data_per_worker *per_worker;
 	/* keep track of the work performed from the beginning of the algorithm to make
-	 * better decisions about which queue to select when stealing or deferring work
+	 * better decisions about which queue to select when deferring work
 	 */
-	unsigned last_pop_worker;
 	unsigned last_push_worker;
 };
 
@@ -118,7 +128,8 @@ static int calibration_value = 0;
  */
 static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsigned sched_ctx_id)
 {
-	unsigned worker = ws->last_pop_worker;
+	unsigned workerid = starpu_worker_get_id_check();
+	unsigned worker = ws->per_worker[workerid].last_pop_worker;
 	unsigned nworkers;
 	int *workerids = NULL;
 	nworkers = starpu_sched_ctx_get_workers_list_raw(sched_ctx_id, &workerids);
@@ -131,14 +142,17 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 		/* Here helgrind would shout that this is unprotected, but we
 		 * are fine with getting outdated values, this is just an
 		 * estimation */
-		ntasks = ws->per_worker[workerids[worker]].queue.ntasks;
-
-		if (ntasks && (ws->per_worker[workerids[worker]].busy
-					   || starpu_worker_is_blocked_in_parallel(workerids[worker])))
-			break;
+		if (!ws->per_worker[workerids[worker]].notask)
+		{
+			if (ws->per_worker[workerids[worker]].busy
+						   || starpu_worker_is_blocked_in_parallel(workerids[worker])) {
+				ntasks = 1;
+				break;
+			}
+		}
 
 		worker = (worker + 1) % nworkers;
-		if (worker == ws->last_pop_worker)
+		if (worker == ws->per_worker[workerid].last_pop_worker)
 		{
 			/* We got back to the first worker,
 			 * don't go in infinite loop */
@@ -147,7 +161,7 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 		}
 	}
 
-	ws->last_pop_worker = (worker + 1) % nworkers;
+	ws->per_worker[workerid].last_pop_worker = (worker + 1) % nworkers;
 
 	worker = workerids[worker];
 
@@ -327,15 +341,31 @@ static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, i
 	{
 		/* found an interesting task, try to pick it! */
 		if (_starpu_prio_deque_pop_this_task(&data_source->queue, target, best_task))
+		{
+			if (!data_source->queue.ntasks)
+			{
+				STARPU_ASSERT(ws->per_worker[source].notask == 0);
+				ws->per_worker[source].notask = 1;
+			}
 			return best_task;
+		}
 	}
 
 	/* Didn't find an interesting task, or couldn't run it :( */
 	int skipped;
+	struct starpu_task *task;
+
 	if (source != target)
-		return _starpu_prio_deque_deque_task_for_worker(&data_source->queue, target, &skipped);
+		task = _starpu_prio_deque_deque_task_for_worker(&data_source->queue, target, &skipped);
 	else
-		return _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
+		task = _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
+
+	if (task && !data_source->queue.ntasks)
+	{
+		STARPU_ASSERT(ws->per_worker[source].notask == 0);
+		ws->per_worker[source].notask = 1;
+	}
+	return task;
 }
 
 /* Called when popping a task from a queue */
@@ -371,10 +401,18 @@ static void locality_pushed_task(struct _starpu_work_stealing_data *ws STARPU_AT
 static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, int source, int target)
 {
 	int skipped;
+	struct starpu_task *task;
 	if (source != target)
-		return _starpu_prio_deque_deque_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
+		task = _starpu_prio_deque_deque_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
 	else
-		return _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
+		task = _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
+
+	if (task && !ws->per_worker[source].queue.ntasks)
+	{
+		STARPU_ASSERT(ws->per_worker[source].notask == 0);
+		ws->per_worker[source].notask = 1;
+	}
+	return task;
 }
 /* Called when popping a task from a queue */
 static void locality_popped_task(struct _starpu_work_stealing_data *ws STARPU_ATTRIBUTE_UNUSED, struct starpu_task *task STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED, unsigned sched_ctx_id STARPU_ATTRIBUTE_UNUSED)
@@ -530,7 +568,8 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 	struct starpu_task *task = NULL;
 	unsigned workerid = starpu_worker_get_id_check();
 
-	ws->per_worker[workerid].busy = 0;
+	if (ws->per_worker[workerid].busy)
+		ws->per_worker[workerid].busy = 0;
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 	if (STARPU_RUNNING_ON_VALGRIND || !_starpu_prio_deque_is_empty(&ws->per_worker[workerid].queue))
@@ -617,7 +656,8 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 		if (!task)
 			return NULL;
 	}
-	ws->per_worker[workerid].busy = !!task;
+	if (ws->per_worker[workerid].busy != !!task)
+		ws->per_worker[workerid].busy = !!task;
 	return task;
 }
 
@@ -648,6 +688,11 @@ int ws_push_task(struct starpu_task *task)
 	record_data_locality(task, workerid);
 	STARPU_ASSERT_MSG(ws->per_worker[workerid].running, "workerid=%d, ws=%p\n", workerid, ws);
 	_starpu_prio_deque_push_back_task(&ws->per_worker[workerid].queue, task);
+	if (ws->per_worker[workerid].queue.ntasks == 1)
+	{
+		STARPU_ASSERT(ws->per_worker[workerid].notask == 1);
+		ws->per_worker[workerid].notask = 0;
+	}
 	locality_pushed_task(ws, task, workerid, sched_ctx_id);
 
 	starpu_push_task_end(task);
@@ -676,10 +721,12 @@ static void ws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworke
 		int workerid = workerids[i];
 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
 		_starpu_prio_deque_init(&ws->per_worker[workerid].queue);
+		ws->per_worker[workerid].notask = 1;
 		ws->per_worker[workerid].running = 1;
 
 		/* Tell helgrind that we are fine with getting outdated values,
 		 * this is just an estimation */
+		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].notask);
 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].queue.ntasks);
 		ws->per_worker[workerid].busy = 0;
 		STARPU_HG_DISABLE_CHECKING(ws->per_worker[workerid].busy);
@@ -708,9 +755,7 @@ static void initialize_ws_policy(unsigned sched_ctx_id)
 	_STARPU_MALLOC(ws, sizeof(struct _starpu_work_stealing_data));
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
 
-	ws->last_pop_worker = 0;
 	ws->last_push_worker = 0;
-	STARPU_HG_DISABLE_CHECKING(ws->last_pop_worker);
 	STARPU_HG_DISABLE_CHECKING(ws->last_push_worker);
 	ws->select_victim = select_victim;
 
@@ -760,11 +805,12 @@ static int lws_select_victim(struct _starpu_work_stealing_data *ws, unsigned sch
 	for (i = 0; i < nworkers; i++)
 	{
 		int neighbor = ws->per_worker[workerid].proxlist[i];
+		if (ws->per_worker[neighbor].notask)
+			continue;
                 /* FIXME: do not keep looking again and again at some worker
                  * which has tasks, but that can't execute on me */
-		int ntasks = ws->per_worker[neighbor].queue.ntasks;
-		if (ntasks && (ws->per_worker[neighbor].busy
-					   || starpu_worker_is_blocked_in_parallel(neighbor)))
+		if (ws->per_worker[neighbor].busy
+					   || starpu_worker_is_blocked_in_parallel(neighbor))
 			return neighbor;
 	}
 	return -1;

+ 1 - 2
src/util/execute_on_all.c

@@ -107,8 +107,7 @@ void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t
 	unsigned nworkers = starpu_worker_get_count();
 	struct starpu_task *tasks[STARPU_NMAXWORKERS];
 
-	/* This method only work on CPU, CUDA, OPENCL */
-	STARPU_ASSERT((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0);
+	STARPU_ASSERT_MSG((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0, "This function is implemented only on CPU, CUDA, OpenCL");
 
 	/* create a wrapper codelet */
 	struct starpu_codelet wrapper_cl =

+ 1 - 0
src/util/starpu_data_cpy.c

@@ -159,6 +159,7 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 	task->callback_func = callback_func;
 	task->callback_arg = callback_arg;
 
+	/* FIXME: priority!! */
 	STARPU_TASK_SET_HANDLE(task, dst_handle, 0);
 	STARPU_TASK_SET_HANDLE(task, src_handle, 1);
 

+ 18 - 1
tests/Makefile.am

@@ -43,6 +43,7 @@ EXTRA_DIST =					\
 	microbenchs/parallel_independent_heterogeneous_tasks.sh	\
 	microbenchs/parallel_independent_homogeneous_tasks_data.sh	\
 	microbenchs/parallel_independent_homogeneous_tasks.sh	\
+	microbenchs/bandwidth_scheds.sh		\
 	energy/static.sh			\
 	energy/dynamic.sh			\
 	energy/perfs.gp				\
@@ -73,7 +74,7 @@ EXTRA_DIST =					\
 	model-checking/starpu-mc.sh.in
 
 CLEANFILES = 					\
-	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90
+	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90 bandwidth-*.dat bandwidth.gp bandwidth.eps bandwidth.svg
 
 BUILT_SOURCES =
 SUBDIRS =
@@ -268,6 +269,7 @@ myPROGRAMS +=				\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release2		\
+	datawizard/acquire_release_to		\
 	datawizard/acquire_try			\
 	datawizard/bcsr				\
 	datawizard/cache			\
@@ -355,6 +357,7 @@ myPROGRAMS +=				\
 	microbenchs/prefetch_data_on_node 	\
 	microbenchs/redundant_buffer		\
 	microbenchs/matrix_as_vector		\
+	microbenchs/bandwidth			\
 	overlap/gpu_concurrency			\
 	parallel_tasks/explicit_combined_worker	\
 	parallel_tasks/parallel_kernels		\
@@ -431,6 +434,8 @@ examplebin_SCRIPTS = \
 	microbenchs/tasks_size_overhead.sh
 if !STARPU_SIMGRID
 if !STARPU_USE_MPI_MASTER_SLAVE
+examplebin_PROGRAMS += \
+	microbenchs/bandwidth
 SHELL_TESTS += \
 	microbenchs/tasks_data_overhead.sh \
 	microbenchs/sync_tasks_data_overhead.sh \
@@ -454,6 +459,7 @@ endif
 if !STARPU_USE_MPI_MASTER_SLAVE
 SHELL_TESTS += \
 	datawizard/locality.sh \
+	microbenchs/bandwidth_scheds.sh \
 	overlap/overlap.sh
 endif
 
@@ -505,6 +511,17 @@ datawizard_acquire_release2_SOURCES +=		\
 	datawizard/acquire_release_opencl.c
 endif
 
+datawizard_acquire_release_to_SOURCES =		\
+	datawizard/acquire_release_to.c
+if STARPU_USE_CUDA
+datawizard_acquire_release_to_SOURCES +=		\
+	datawizard/acquire_release_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_acquire_release_to_SOURCES +=		\
+	datawizard/acquire_release_opencl.c
+endif
+
 datawizard_scratch_SOURCES =			\
 	datawizard/scratch.c
 if STARPU_USE_CUDA

+ 214 - 0
tests/datawizard/acquire_release_to.c

@@ -0,0 +1,214 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+/*
+ * Check that _release_to correctly interacts with tasks working on the same data
+ */
+
+#ifdef STARPU_QUICK_CHECK
+static unsigned ntasks = 10;
+#elif !defined(STARPU_LONG_CHECK)
+static unsigned ntasks = 1000;
+#else
+static unsigned ntasks = 10000;
+#endif
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void increment_opencl(void *buffers[], void *args);
+#endif
+
+void increment_cpu(void *descr[], void *arg)
+{
+	(void)arg;
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.modes = { STARPU_RW },
+	.cpu_funcs = {increment_cpu},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
+#endif
+	.cpu_funcs_name = {"increment_cpu"},
+	.nbuffers = 1
+};
+
+void check_cpu(void *descr[], void *arg)
+{
+	unsigned *val = arg;
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	STARPU_ASSERT(*tokenptr == *val);
+}
+
+static struct starpu_codelet check_cl =
+{
+	.modes = { STARPU_R },
+	.cpu_funcs = {check_cpu},
+	.cpu_funcs_name = {"check_cpu"},
+	.nbuffers = 1
+};
+
+unsigned token = 0;
+starpu_data_handle_t token_handle;
+
+static
+int increment_token(void)
+{
+	int ret;
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	ret = starpu_task_submit(task);
+	return ret;
+}
+
+static
+int check_token(unsigned value)
+{
+	unsigned *value_p;
+	int ret;
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &check_cl;
+	task->handles[0] = token_handle;
+	task->cl_arg = value_p = malloc(sizeof(*value_p));
+	task->cl_arg_size = sizeof(*value_p);
+	task->cl_arg_free = 1;
+	*value_p = value;
+	ret = starpu_task_submit(task);
+	return ret;
+}
+
+static
+void callback(void *arg)
+{
+	(void)arg;
+	token++;
+	starpu_data_release_to(token_handle, STARPU_W);
+	starpu_sleep(0.001);
+	starpu_data_release_to(token_handle, STARPU_R);
+	starpu_sleep(0.001);
+	starpu_data_release(token_handle);
+}
+
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program;
+#endif
+int main(int argc, char **argv)
+{
+	unsigned i;
+	int ret;
+
+        ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/acquire_release_opencl_kernel.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+	starpu_variable_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, sizeof(unsigned));
+
+        FPRINTF(stderr, "Token: %u\n", token);
+
+	for(i=0; i<ntasks; i++)
+	{
+		/* synchronize data in RAM */
+                ret = starpu_data_acquire(token_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+                token ++;
+
+		ret = check_token(4*i+1);
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = increment_token();
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = check_token(4*i+2);
+		if (ret == -ENODEV) goto enodev_release;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		starpu_sleep(0.001);
+		starpu_data_release_to(token_handle, STARPU_W);
+
+		starpu_sleep(0.001);
+		starpu_data_release_to(token_handle, STARPU_R);
+
+		starpu_sleep(0.001);
+		starpu_data_release(token_handle);
+
+		ret = starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
+
+		ret = check_token(4*i+3);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = increment_token();
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+		ret = check_token(4*i+4);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	}
+
+	starpu_data_unregister(token_handle);
+
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+
+        FPRINTF(stderr, "Token: %u\n", token);
+	if (token == ntasks * 4)
+		ret = EXIT_SUCCESS;
+	else
+		ret = EXIT_FAILURE;
+	return ret;
+
+enodev_release:
+	starpu_data_release(token_handle);
+enodev:
+	starpu_data_unregister(token_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 3 - 3
tests/datawizard/bcsr.c

@@ -39,20 +39,20 @@ void cpu_show_bcsr(void *descr[], void *arg)
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
-	printf("\nnnz %d elemsize %d\n", nnz, elemsize);
+	printf("\nnnz %u elemsize %u\n", nnz, elemsize);
 
 	for (i = 0; i < nrow; i++)
 	{
 		uint32_t row_start = rowptr[i] - firstentry;
 		uint32_t row_end = rowptr[i+1] - firstentry;
 
-		printf("row %d\n", i);
+		printf("row %u\n", i);
 
 		for (j = row_start; j < row_end; j++)
 		{
 			int *block = nzval + j * r*c;
 
-			printf( " column %d\n", colind[j]);
+			printf( " column %u\n", colind[j]);
 
 			for (y = 0; y < r; y++)
 			{

+ 3 - 3
tests/datawizard/interfaces/block/block_opencl.c

@@ -83,12 +83,12 @@ test_block_opencl_func(void *buffers[], void *args)
 	}
 			
 	{
-		size_t global = nx * ny * nz;
+		size_t global[3] = {nx, ny, nz};
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
-					     1,
+					     3,
 					     NULL,
-					     &global,
+					     global,
 					     NULL,
 					     0,
 					     NULL,

+ 15 - 23
tests/datawizard/interfaces/block/block_opencl_kernel.cl

@@ -18,29 +18,21 @@ __kernel void block_opencl(__global int *block,
 			   int ldy, int ldz,
 			   int factor, __global int *err)
 {
-        const int id = get_global_id(0);
-	if (id > 0)
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2);
+	if (idx >= nx)
 		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+
+	int val = idz*ny*nx+idy*nx+idx;
+	int i = (idz*ldz)+(idy*ldy)+idx;
 
-	unsigned int i, j, k;
-	int val = 0;
-	for (k = 0; k < nz; k++)
-	{
-		for (j = 0; j < ny; j++)
-		{
-			for (i = 0; i < nx; i++)
-			{
-                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
-				{
-					*err = 1;
-					return;
-				}
-				else
-				{
-					block[(k*ldz)+(j*ldy)+i] *= -1;
-					val++;
-				}
-			}
-		}
-	}
+	if (block[i] != factor * val)
+		*err = 1;
+	else
+		block[i] *= -1;
 }

+ 1 - 1
tests/datawizard/interfaces/tensor/tensor_interface.c

@@ -18,7 +18,7 @@
 #include "../test_interfaces.h"
 #include "../../../helper.h"
 
-#define NX 16
+#define NX 4
 #define NY NX
 #define NZ NX
 #define NT NX

+ 3 - 3
tests/datawizard/interfaces/tensor/tensor_opencl.c

@@ -87,12 +87,12 @@ test_tensor_opencl_func(void *buffers[], void *args)
 	}
 			
 	{
-                size_t global = 1;
+		size_t global[3] = {nx, ny, nz*nt};
 		err = clEnqueueNDRangeKernel(queue,
 					     kernel,
-					     1,
+					     3,
 					     NULL,
-					     &global,
+					     global,
 					     NULL,
 					     0,
 					     NULL,

+ 18 - 26
tests/datawizard/interfaces/tensor/tensor_opencl_kernel.cl

@@ -18,32 +18,24 @@ __kernel void tensor_opencl(__global int *tensor,
 			   int ldy, int ldz, int ldt,
 			   int factor, __global int *err)
 {
-        const int id = get_global_id(0);
-	if (id > 0)
+	const int idx = get_global_id(0);
+	const int idy = get_global_id(1);
+	const int idz = get_global_id(2) % nz;
+	const int idt = get_global_id(2) / nz;
+	if (idx >= nx)
 		return;
+	if (idy >= ny)
+		return;
+	if (idz >= nz)
+		return;
+	if (idt >= nt)
+		return;
+
+	int val = idt*nz*ny*nx+idz*ny*nx+idy*nx+idx;
+	int i = (idt*ldt)+(idz*ldz)+(idy*ldy)+idx;
 
-	unsigned int i, j, k, l;
-	int val = 0;
-	for (l = 0; l < nt; l++)
-	{
-	    for (k = 0; k < nz; k++)
-	    {
-		for (j = 0; j < ny; j++)
-		{
-			for (i = 0; i < nx; i++)
-			{
-                                if (tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] != factor * val)
-				{
-					*err = 1;
-					return;
-				}
-				else
-				{
-					tensor[(l*ldt)+(k*ldz)+(j*ldy)+i] *= -1;
-					val++;
-				}
-			}
-		}
-	    }
-	}
+	if (tensor[i] != factor * val)
+		*err = 1;
+	else
+		tensor[i] *= -1;
 }

+ 8 - 0
tests/energy/energy_efficiency.c

@@ -320,11 +320,19 @@ int main(int argc, char *argv[])
 
 	unsigned N, k, m, n, iter, NITER;
 	if (argc < 2)
+#ifdef STARPU_QUICK_CHECK
+		N = 10;
+#else
 		N = 40;
+#endif
 	else
 		N = atoi(argv[1]);
 	if (argc < 3)
+#ifdef STARPU_QUICK_CHECK
+		NITER = 3;
+#else
 		NITER = 10;
+#endif
 	else
 		NITER = atoi(argv[2]);
 	if (N == 0)

+ 4 - 4
tests/helper/starpu_data_dup_ro.c

@@ -84,7 +84,7 @@ int main(int argc, char **argv)
 	ret = EXIT_SUCCESS;
 	if (*var != 42)
 	{
-	     FPRINTF(stderr, "var2 is %d but it should be %d\n", *var, 42);
+	     FPRINTF(stderr, "var2 is %u but it should be %d\n", *var, 42);
 	     ret = EXIT_FAILURE;
 	}
 	starpu_data_release(var2_handle);
@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 	var = starpu_data_get_local_ptr(var3_handle);
 	if (*var != 42)
 	{
-	     FPRINTF(stderr, "var3 is %d but it should be %d\n", *var, 42);
+	     FPRINTF(stderr, "var3 is %u but it should be %d\n", *var, 42);
 	     ret = EXIT_FAILURE;
 	}
 	starpu_data_release(var3_handle);
@@ -102,7 +102,7 @@ int main(int argc, char **argv)
 	var = starpu_data_get_local_ptr(var4_handle);
 	if (*var != 42)
 	{
-	     FPRINTF(stderr, "var4 is %d but it should be %d\n", *var, 42);
+	     FPRINTF(stderr, "var4 is %u but it should be %d\n", *var, 42);
 	     ret = EXIT_FAILURE;
 	}
 	starpu_data_release(var4_handle);
@@ -111,7 +111,7 @@ int main(int argc, char **argv)
 	var = starpu_data_get_local_ptr(var5_handle);
 	if (*var != 43)
 	{
-	     FPRINTF(stderr, "var5 is %d but it should be %d\n", *var, 43);
+	     FPRINTF(stderr, "var5 is %u but it should be %d\n", *var, 43);
 	     ret = EXIT_FAILURE;
 	}
 	starpu_data_release(var5_handle);

+ 343 - 0
tests/microbenchs/bandwidth.c

@@ -0,0 +1,343 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+
+#include <starpu.h>
+#include "../helper.h"
+
+/*
+ * Measure the memory bandwidth available to kernels depending on the number of
+ * kernels and number of idle workers.
+ */
+
+#ifdef STARPU_QUICK_CHECK
+static size_t size = 1024;
+static unsigned cpustep = 4;
+#else
+/* Must be bigger than available cache size per core, 64MiB should be enough */
+static size_t size = 64UL << 20;
+static unsigned cpustep = 1;
+#endif
+
+static unsigned noalone = 0;
+static unsigned iter = 30;
+static unsigned total_ncpus;
+static starpu_pthread_barrier_t barrier_begin, barrier_end;
+static float *result;
+static void **buffers;	/* Indexed by logical core number */
+static char padding1[STARPU_CACHELINE_SIZE];
+static volatile char finished;
+static char padding2[STARPU_CACHELINE_SIZE];
+
+static unsigned interleave(unsigned i);
+
+/* Initialize the buffer locally */
+void initialize_buffer(void *foo)
+{
+	unsigned id = starpu_worker_get_id();
+#ifdef STARPU_HAVE_POSIX_MEMALIGN
+	int ret = posix_memalign(&buffers[id], getpagesize(), 2*size);
+	STARPU_ASSERT(ret == 0);
+#else
+	buffers[id] = malloc(2*size);
+#endif
+	memset(buffers[id], 0, 2*size);
+}
+
+/* Actual transfer codelet */
+void bw_func(void *descr[], void *arg)
+{
+	int id = (uintptr_t) arg;
+	void *src = buffers[id];
+	void *dst = (void*) ((uintptr_t)src + size);
+	unsigned i;
+	double start, stop;
+
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	start = starpu_timing_now();
+	for (i = 0; i < iter; i++)
+	{
+		memcpy(dst, src, size);
+		STARPU_SYNCHRONIZE();
+	}
+	stop = starpu_timing_now();
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_end);
+	finished = 1;
+
+	result[id] = (size*iter) / (stop - start);
+}
+
+static struct starpu_codelet bw_codelet =
+{
+	.cpu_funcs = {bw_func},
+	.model = NULL,
+	.nbuffers = 0,
+};
+
+/* Codelet that waits for completion while doing lots of cpu yields (nop). */
+void nop_func(void *descr[], void *arg)
+{
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	while (!finished)
+	{
+		unsigned i;
+		for (i = 0; i < 1000000; i++)
+			STARPU_UYIELD();
+		STARPU_SYNCHRONIZE();
+	}
+}
+
+static struct starpu_codelet nop_codelet =
+{
+	.cpu_funcs = {nop_func},
+	.model = NULL,
+	.nbuffers = 0,
+};
+
+/* Codelet that waits for completion while aggressively reading the finished variable. */
+void sync_func(void *descr[], void *arg)
+{
+	STARPU_PTHREAD_BARRIER_WAIT(&barrier_begin);
+	while (!finished)
+	{
+		STARPU_VALGRIND_YIELD();
+		STARPU_SYNCHRONIZE();
+	}
+}
+
+static struct starpu_codelet sync_codelet =
+{
+	.cpu_funcs = {sync_func},
+	.model = NULL,
+	.nbuffers = 0,
+};
+
+static void usage(char **argv)
+{
+	fprintf(stderr, "Usage: %s [-n niter] [-s size (MB)] [-c cpustep] [-a]\n", argv[0]);
+	fprintf(stderr, "\t-n niter\tNumber of iterations\n");
+	fprintf(stderr, "\t-s size\tBuffer size in MB\n");
+	fprintf(stderr, "\t-c cpustep\tCpu number increment\n");
+	fprintf(stderr, "\t-a Do not run the alone test\n");
+	exit(EXIT_FAILURE);
+}
+
+static void parse_args(int argc, char **argv)
+{
+	int c;
+	while ((c = getopt(argc, argv, "n:s:c:ah")) != -1)
+	switch(c)
+	{
+		case 'n':
+			iter = atoi(optarg);
+			break;
+		case 's':
+			size = (long)atoi(optarg) << 20;
+			break;
+		case 'c':
+			cpustep = atoi(optarg);
+			break;
+		case 'a':
+			noalone = 1;
+			break;
+		case 'h':
+			usage(argv);
+			break;
+	}
+}
+
+static unsigned interleave(unsigned i)
+{
+	/* TODO: rather distribute over hierarchy */
+	if (total_ncpus > 1)
+		return (i % (total_ncpus/2))*2 + i / (total_ncpus/2);
+	else
+		return 0;
+}
+
+enum sleep_type {
+	PAUSE,
+	NOP,
+	SYNC,
+	SCHED,
+};
+
+static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int intl, enum sleep_type sleep)
+{
+	int ret;
+	unsigned i;
+	struct starpu_conf conf;
+	float bw;
+
+	starpu_conf_init(&conf);
+	conf.precedence_over_environment_variables = 1;
+	conf.ncuda = 0;
+	conf.nopencl = 0;
+	conf.nmic = 0;
+	conf.nmpi_ms = 0;
+	conf.ncpus = ncpus;
+
+	if (intl && sleep == PAUSE)
+	{
+		conf.use_explicit_workers_bindid = 1;
+		for (i = 0; i < ncpus; i++)
+			conf.workers_bindid[i] = interleave(i);
+	}
+
+	ret = starpu_initialize(&conf, argc, argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	if (sleep == PAUSE || sleep == SCHED)
+		/* In these cases we don't have a task on each cpu */
+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, nbusy);
+	else
+		STARPU_PTHREAD_BARRIER_INIT(&barrier_begin, NULL, ncpus);
+
+	STARPU_PTHREAD_BARRIER_INIT(&barrier_end, NULL, nbusy);
+
+	finished = 0;
+	for (i = 0; i < ncpus; i++)
+		result[i] = NAN;
+
+	for (i = 0; i < nbusy; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &bw_codelet;
+
+		if (intl)
+			task->cl_arg = (void*) (uintptr_t) interleave(i);
+		else
+			task->cl_arg = (void*) (uintptr_t) i;
+
+		task->execute_on_a_specific_worker = 1;
+		if (intl && sleep != PAUSE) /* In the pause case we interleaved above */
+			task->workerid = interleave(i);
+		else
+			task->workerid = i;
+
+		ret = starpu_task_submit(task);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	if (sleep != PAUSE && sleep != SCHED)
+	{
+		/* Add waiting tasks */
+		for ( ; i < ncpus; i++)
+		{
+			struct starpu_task *task = starpu_task_create();
+			switch (sleep)
+			{
+			case NOP:
+				task->cl = &nop_codelet;
+				break;
+			case SYNC:
+				task->cl = &sync_codelet;
+				break;
+			default:
+				STARPU_ASSERT(0);
+			}
+			task->execute_on_a_specific_worker = 1;
+			task->workerid = interleave(i);
+			ret = starpu_task_submit(task);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+	}
+
+
+	starpu_task_wait_for_all();
+	starpu_shutdown();
+
+	for (bw = 0., i = 0; i < nbusy; i++)
+	{
+		if (intl)
+			bw += result[interleave(i)];
+		else
+			bw += result[i];
+	}
+	return bw;
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+	unsigned n;
+	struct starpu_conf conf;
+	float alone, alone_int, alone_int_nop, alone_int_sync, sched, sched_int;
+
+	parse_args(argc, argv);
+
+	starpu_conf_init(&conf);
+	conf.precedence_over_environment_variables = 1;
+	conf.ncuda = 0;
+	conf.nopencl = 0;
+	conf.nmic = 0;
+	conf.nmpi_ms = 0;
+
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	total_ncpus = starpu_cpu_worker_get_count();
+
+	buffers = malloc(total_ncpus * sizeof(*buffers));
+	starpu_execute_on_each_worker_ex(initialize_buffer, NULL, STARPU_CPU, "init_buffer");
+	starpu_shutdown();
+
+	if (total_ncpus == 0)
+		return STARPU_TEST_SKIPPED;
+
+	result = malloc(total_ncpus * sizeof(result[0]));
+
+	printf("# nw\ta comp.\t+sched\teff%%\ta scat.\t+nop\t+sync\t+sched\teff%% vs nop\n");
+	for (n = cpustep; n <= total_ncpus; n += cpustep)
+	{
+		if (noalone)
+		{
+			alone = 0.;
+			alone_int = 0.;
+			alone_int_nop = 0.;
+			alone_int_sync = 0.;
+		}
+		else
+		{
+			alone = bench(&argc, &argv, n, n, 0, PAUSE);
+			alone_int = bench(&argc, &argv, n, n, 1, PAUSE);
+			alone_int_nop = bench(&argc, &argv, n, total_ncpus, 1, NOP);
+			alone_int_sync = bench(&argc, &argv, n, total_ncpus, 1, SYNC);
+		}
+		sched = bench(&argc, &argv, n, total_ncpus, 0, SCHED);
+		sched_int = bench(&argc, &argv, n, total_ncpus, 1, SCHED);
+		printf("%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n",
+				n,
+				alone/1000,
+				sched/1000, sched*100/alone,
+				alone_int/1000,
+				alone_int_nop/1000,
+				alone_int_sync/1000,
+				sched_int/1000, sched_int*100/alone_int_nop);
+		fflush(stdout);
+	}
+
+	free(result);
+
+	for (n = 0; n < total_ncpus; n++)
+		free(buffers[n]);
+	free(buffers);
+
+	return EXIT_SUCCESS;
+}

+ 75 - 0
tests/microbenchs/bandwidth_scheds.sh

@@ -0,0 +1,75 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2016-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+set -e
+
+if [ -n "$STARPU_SCHED" ]
+then
+	SCHEDS=$STARPU_SCHED
+	DEFAULT=$STARPU_SCHED
+else
+	SCHEDS=`$(dirname $0)/../../tools/starpu_sched_display`
+	DEFAULT=eager
+fi
+
+if [ -n "$STARPU_BENCH_DIR" ]; then
+	cat > bandwidth.gp << EOF
+set term svg font ",12" size 1500,500 linewidth 0.5
+set output "bandwidth.svg"
+set pointsize 0.3
+EOF
+else
+	fast="-n 3 -c 4"
+	cat > bandwidth.gp << EOF
+set term postscript eps enhanced color font ",18"
+set output "bandwidth.eps"
+set size 2,1
+EOF
+fi
+
+cat >> bandwidth.gp << EOF
+set key outside
+set ylabel "GB/s"
+set xlabel "ncores"
+
+plot \\
+	"bandwidth-$DEFAULT.dat" using 1:5 with lines title "alone interleave", \\
+	"bandwidth-$DEFAULT.dat" using 1:6 with lines title "nop", \\
+	"bandwidth-$DEFAULT.dat" using 1:7 with lines title "sync", \\
+	"bandwidth-$DEFAULT.dat" using 1:2 with lines title "alone contiguous", \\
+EOF
+
+type=1
+for sched in $SCHEDS
+do
+	if [ "$sched" != eager -a "$sched" != "$SCHEDS" ]; then
+		extra=-a
+	else
+		extra=
+	fi
+
+	STARPU_BACKOFF_MIN=0 STARPU_BACKOFF_MAX=0 STARPU_SCHED=$sched $STARPU_LAUNCH $(dirname $0)/bandwidth $fast $extra "$@" | tee bandwidth-$sched.dat
+	echo "\"bandwidth-$sched.dat\" using 1:3 with linespoints lt $type pt $type title \"$sched\", \\" >> bandwidth.gp
+	echo "\"bandwidth-$sched.dat\" using 1:8 with linespoints lt $type pt $type notitle, \\" >> bandwidth.gp
+	type=$((type+1))
+done
+
+if gnuplot bandwidth.gp ; then
+	if [ -n "$STARPU_BENCH_DIR" ]; then
+		cp bandwidth.svg $STARPU_BENCH_DIR/
+	fi
+fi

+ 1 - 0
tests/overlap/overlap.c

@@ -135,6 +135,7 @@ int main(int argc, char **argv)
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
+	starpu_do_schedule();
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (!finished)
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);

+ 1 - 1
tests/parallel_tasks/explicit_combined_worker.c

@@ -114,7 +114,7 @@ int main(void)
 enodev:
 	starpu_data_unregister(v_handle);
 	starpu_free(v);
-	fprintf(stderr, "WARNING: No one can execute the task on workerid %d\n", worker);
+	fprintf(stderr, "WARNING: No one can execute the task on workerid %u\n", worker);
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
 	starpu_shutdown();

+ 1 - 0
tools/Makefile.am

@@ -285,6 +285,7 @@ EXTRA_DIST =				\
 	dev/cppcheck/suppressions.txt	\
 	dev/valgrind/bash.suppr		\
 	dev/valgrind/fxt.suppr		\
+	dev/valgrind/glpk.suppr		\
 	dev/valgrind/hdf5.suppr		\
 	dev/valgrind/hwloc.suppr	\
 	dev/valgrind/libc.suppr		\

+ 23 - 0
tools/dev/valgrind/glpk.suppr

@@ -0,0 +1,23 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   ...
+   fun:glp_init_env
+}

+ 1 - 2
tools/dev/valgrind/libc.suppr

@@ -263,8 +263,7 @@
    Memcheck:Leak
    match-leak-kinds: reachable
    fun:malloc
-   fun:_dl_close_worker
-   fun:_dl_close_worker
+   ...
    fun:_dl_close
    fun:_dl_catch_exception
    fun:_dl_catch_error

+ 1 - 1
tools/dev/valgrind/padico.suppr

@@ -110,7 +110,7 @@
    Memcheck:Leak
    match-leak-kinds: reachable
    fun:malloc
-   fun:_dl_close_worker
+   ...
    fun:_dl_close
    fun:_dl_catch_error
    fun:dlerror_run

+ 95 - 65
tools/gdbinit

@@ -39,33 +39,6 @@ define starpu-print-task
   set $task = (struct starpu_task *)$arg0
   set $job = (struct _starpu_job *)$task->starpu_private
   set $status=0
-  if $task->status == 0
-    set $status="STARPU_TASK_INIT"
-  end
-  if $task->status == 1
-    set $status="STARPU_TASK_BLOCKED"
-  end
-  if $task->status == 2
-    set $status="STARPU_TASK_READY"
-  end
-  if $task->status == 3
-    set $status="STARPU_TASK_RUNNING"
-  end
-  if $task->status == 4
-    set $status="STARPU_TASK_FINISHED"
-  end
-  if $task->status == 5
-    set $status="STARPU_TASK_BLOCKED_ON_TAG"
-  end
-  if $task->status == 6
-    set $status="STARPU_TASK_BLOCKED_ON_TASK"
-  end
-  if $task->status == 7
-    set $status="STARPU_TASK_BLOCKED_ON_DATA"
-  end
-  if $task->status == 8
-    set $status="STARPU_TASK_STOPPED"
-  end
 
   printf "StarPU Task (%p)\n", $task
   if $task->name
@@ -81,13 +54,42 @@ define starpu-print-task
   end
   printf "\tnbuffers:\t\t\t<%d>\n", $nbuffers
   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func
+  printf "\tcl_arg:\t\t\t<%p>\n", $task->cl_arg
   printf "\tsynchronous:\t\t\t<%d>\n", $task->synchronous
   printf "\texecute_on_a_specific_worker:\t<%d>\n", $task->execute_on_a_specific_worker
   printf "\tworkerid:\t\t\t<%d>\n", $task->workerid
   printf "\tdetach:\t\t\t\t<%d>\n", $task->detach
   printf "\tdestroy:\t\t\t<%d>\n", $task->destroy
   printf "\tregenerate:\t\t\t<%d>\n", $task->regenerate
-  printf "\tstatus:\t\t\t\t<%s>\n", $status
+  printf "\tstatus:\t\t\t\t"
+  if $task->status == 0
+    printf "STARPU_TASK_INIT"
+  end
+  if $task->status == 1
+    printf "STARPU_TASK_BLOCKED"
+  end
+  if $task->status == 2
+    printf "STARPU_TASK_READY"
+  end
+  if $task->status == 3
+    printf "STARPU_TASK_RUNNING"
+  end
+  if $task->status == 4
+    printf "STARPU_TASK_FINISHED"
+  end
+  if $task->status == 5
+    printf "STARPU_TASK_BLOCKED_ON_TAG"
+  end
+  if $task->status == 6
+    printf "STARPU_TASK_BLOCKED_ON_TASK"
+  end
+  if $task->status == 7
+    printf "STARPU_TASK_BLOCKED_ON_DATA"
+  end
+  if $task->status == 8
+    printf "STARPU_TASK_STOPPED"
+  end
+  printf "\n"
   printf "\tjob:\t\t\t\t<%p>\n", $job
   printf "\ttag_id:\t\t\t\t<%d>\n", $task->tag_id
   printf "\tndeps:\t\t\t\t<%u>\n", $job->job_successors->ndeps
@@ -169,35 +171,36 @@ define starpu-workers
   printf "[Id] Name                                     Arch Mask Devid Bindid Workerid Isrunning Isinitialized Status\n"
   while $num<_starpu_config->topology->nworkers
     set $worker=&_starpu_config->workers[$num]
+    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d ", $num, $worker->name, $worker->arch, $worker->worker_mask, \
+          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized
     if $worker->status == STATUS_INVALID
-      set $status="INVALID"
+      printf "INVALID"
     end
     if $worker->status == STATUS_UNKNOWN
-      set $status="UNKNOWN"
+      printf "UNKNOWN"
     end
     if $worker->status == STATUS_INITIALIZING
-      set $status="INITIALIZING"
+      printf "INITIALIZING"
     end
     if $worker->status == STATUS_EXECUTING
-      set $status="EXECUTING"
+      printf "EXECUTING"
     end
     if $worker->status == STATUS_CALLBACK
-      set $status="CALLBACK"
+      printf "CALLBACK"
     end
     if $worker->status == STATUS_SCHEDULING
-      set $status="SCHEDULING"
+      printf "SCHEDULING"
     end
     if $worker->status == STATUS_WAITING
-      set $status="WAITING"
+      printf "WAITING"
     end
     if $worker->status == STATUS_SLEEPING_SCHEDULING
-      set $status="SLEEPING_SCHEDULING"
+      printf "SLEEPING_SCHEDULING"
     end
     if $worker->status == STATUS_SLEEPING
-      set $status="SLEEPING"
+      printf "SLEEPING"
     end
-    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d %s\n", $num, $worker->name, $worker->arch, $worker->worker_mask, \
-          $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized, $status
+    printf "\n"
     set $num = $num + 1
   end
 end
@@ -205,23 +208,24 @@ end
 define starpu-print-tag
   set language c
   set $tag_struct = (struct _starpu_tag *)_gettag_struct($arg0)
+  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
+  printf "\tstate "
   if $tag_struct->state == STARPU_INVALID_STATE
-     set $status="STARPU_INVALID_STATE"
+     printf "STARPU_INVALID_STATE"
   end
   if $tag_struct->state == STARPU_ASSOCIATED
-     set $status="STARPU_ASSOCIATED"
+     printf "STARPU_ASSOCIATED"
   end
   if $tag_struct->state == STARPU_BLOCKED
-     set $status="STARPU_BLOCKED"
+     printf "STARPU_BLOCKED"
   end
   if $tag_struct->state == STARPU_READY
-     set $status="STARPU_READY"
+     printf "STARPU_READY"
   end
   if $tag_struct->state == STARPU_DONE
-     set $status="STARPU_DONE"
+     printf "STARPU_DONE"
   end
-  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
-  printf "\tstate %s\n", $status
+  printf "\n"
   printf "\tdeps %d\n", $tag_struct->tag_successors.ndeps
   printf "\tcompleted %d\n", $tag_struct->tag_successors.ndeps_completed
   printf "\tndeps_remaining:\t\t<%u>\n", $tag_struct->tag_successors->ndeps - $tag_struct->tag_successors->ndeps_completed
@@ -317,7 +321,11 @@ define starpu-all-tasks
     while $l != &all_jobs_list
       set $j = (struct _starpu_job*) (((unsigned long) $l) - ((unsigned long) &((struct _starpu_job *)0)->all_submitted))
       set $task = $j->task
-      printf "task %p %s\n", $task, $task->name ? $task->name : ""
+      if $task->name
+        printf "task %p %s\n", $task, $task->name
+      else
+        printf "task %p\n", $task
+      end
       set $l = $l->next
     end
   end
@@ -915,9 +923,9 @@ end
 define starpu-sched-print-component
     set $c = (struct starpu_sched_component *) $arg1
     starpu-print-spaces $arg0
-    printf "%s %s %s (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? "homogeneous":"heterogeneous", $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? "single-node" : "multi-node", $c
+    printf "%s %c %c (struct starpu_sched_component *) %p\n", $c->name, $c->properties & STARPU_SCHED_COMPONENT_HOMOGENEOUS ? 'o':'e', $c->properties & STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE ? 's' : 'm', $c
     if $c->push_task == fifo_push_task
-      set $f = ((struct _starpu_fifo_data *) $c->data)->fifo
+      set $f = &((struct _starpu_fifo_data *) $c->data)->fifo
       starpu-print-spaces $arg0
       printf "%d tasks start %f len %f end %f processed %d\n", $f->ntasks, $f->exp_start, $f->exp_len, $f->exp_end, $f->nprocessed
     end
@@ -951,29 +959,29 @@ end
 
 define starpu-mpi-print-request
     set $request = (struct _starpu_mpi_req *)$arg0
-    set $request_type = "unknown_type"
+    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type ", $request, $request->data_handle, $request->data_handle && $request->data_handle->mpi_data ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.node.rank,
     if $request->request_type == SEND_REQ
-       set $request_type = "SEND_REQ"
+       printf "SEND_REQ"
     end
     if $request->request_type == RECV_REQ
-       set $request_type = "RECV_REQ"
+       printf "RECV_REQ"
     end
     if $request->request_type == WAIT_REQ
-       set $request_type = "WAIT_REQ"
+       printf "WAIT_REQ"
     end
     if $request->request_type == TEST_REQ
-       set $request_type = "TEST_REQ"
+       printf "TEST_REQ"
     end
     if $request->request_type == BARRIER_REQ
-       set $request_type = "BARRIER_REQ"
+       printf "BARRIER_REQ"
     end
     if $request->request_type == PROBE_REQ
-       set $request_type = "PROBE_REQ"
+       printf "PROBE_REQ"
     end
     if $request->request_type == UNKNOWN_REQ
-       set $request_type = "UNKNOWN_REQ"
+       printf "UNKNOWN_REQ"
     end
-    printf "Request (struct _starpu_mpi_req *) %p data %p tag %d to MPI node %d type %s submitted %d completed %d posted %d detached %d is_internal_req %d\n", $request, $request->data_handle, $request->data_handle ? ((struct _starpu_mpi_node_tag *) ($request->data_handle->mpi_data))->data_tag : -1, $request->node_tag.rank, $request_type, $request->submitted, $request->completed, $request->posted, $request->detached, $request->is_internal_req
+    printf " submitted %d completed %d posted %d detached %d\n", $request->submitted, $request->completed, $request->posted, $request->detached
 end
 
 define starpu-mpi-print-ready-recv-requests
@@ -989,17 +997,39 @@ define starpu-mpi-print-ready-recv-requests
     end
 end
 
+define starpu-mpi-print-requests-list
+  set $list = $arg0
+  set $request = $list->_head
+  while $request
+    starpu-mpi-print-request $request
+    set $request = $request->_next
+  end
+end
+
+define starpu-mpi-print-requests-tree
+  if $arg0
+    starpu-mpi-print-requests-tree $arg0->children[0]
+    set $stage = (struct _starpu_mpi_req_prio_list_stage *) $arg0
+    starpu-mpi-print-requests-list (&($stage->list))
+    starpu-mpi-print-requests-tree $arg0->children[1]
+  end
+end
+
 define starpu-mpi-print-ready-send-requests
-    set $list = (struct _starpu_mpi_req_prio_list) ready_send_requests
-    if $list
-	set $request = $list.list._head
-        while $request
-            starpu-mpi-print-request $request
-	    set $request = $request->_next
-	end
+  set $prio_list = (struct _starpu_mpi_req_prio_list) ready_send_requests
+  if _starpu_debug
+    if $prio_list
+        starpu-mpi-print-requests-list &$prio_list.list
+    else
+	printf "No ready send requests\n"
+    end
+  else
+    if $prio_list.empty == 0
+        starpu-mpi-print-requests-tree $prio_list.tree.root
     else
 	printf "No ready send requests\n"
     end
+  end
 end
 
 define starpu-mpi-print-detached-requests

+ 2 - 2
tools/starpu_fxt_tool.c

@@ -81,7 +81,7 @@ static int parse_args(int argc, char **argv)
 		{
 			if (options.ninputfiles >= STARPU_FXT_MAX_FILES)
 			{
-				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%u)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
+				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%d)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
 				return 7;
 			}
 			options.filenames[options.ninputfiles++] = argv[++i];
@@ -179,7 +179,7 @@ static int parse_args(int argc, char **argv)
 		{
 			if (options.ninputfiles >= STARPU_FXT_MAX_FILES)
 			{
-				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%u)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
+				fprintf(stderr, "Error: The number of trace files is superior to STARPU_FXT_MAX_FILES (%d)\nPlease recompile StarPU with a bigger --enable-fxt-max-files\n", STARPU_FXT_MAX_FILES);
 				return 7;
 			}
 			options.filenames[options.ninputfiles++] = argv[i];

+ 0 - 0
tools/starpu_perfmodel_recdump.c


部分文件因文件數量過多而無法顯示