Selaa lähdekoodia

Merge branch 'master' into fpga

Nathalie Furmento 5 vuotta sitten
vanhempi
commit
1bef44b8f4
100 muutettua tiedostoa jossa 1312 lisäystä ja 593 poistoa
  1. 2 0
      ChangeLog
  2. 30 17
      configure.ac
  3. 87 2
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  4. 3 2
      doc/doxygen/chapters/470_simgrid.doxy
  5. 22 1
      doc/doxygen/chapters/api/threads.doxy
  6. BIN
      doc/doxygen/chapters/images/starvz_visu.eps
  7. BIN
      doc/doxygen/chapters/images/starvz_visu.png
  8. 1 2
      doc/doxygen_dev/Makefile.am
  9. 2 2
      examples/cholesky/cholesky.sh
  10. 9 9
      examples/heat/heat.sh
  11. 11 11
      examples/lu/lu.sh
  12. 19 3
      examples/mult/xgemm.c
  13. 2 2
      examples/scheduler/schedulers.sh
  14. 2 2
      examples/scheduler/schedulers_context.sh
  15. 5 1
      include/starpu_config.h.in
  16. 8 2
      include/starpu_task.h
  17. 10 4
      include/starpu_thread.h
  18. 53 5
      include/starpu_thread_util.h
  19. 21 1
      m4/libs.m4
  20. 1 5
      mpi/examples/Makefile.am
  21. 3 3
      mpi/examples/perf.sh
  22. 2 0
      mpi/examples/user_datatype/my_interface.h
  23. 70 34
      mpi/src/mpi/starpu_mpi_mpi.c
  24. 8 8
      mpi/src/mpi/starpu_mpi_mpi_backend.c
  25. 8 3
      mpi/src/nmad/starpu_mpi_nmad.c
  26. 4 4
      mpi/src/starpu_mpi.c
  27. 17 17
      mpi/src/starpu_mpi_req.c
  28. 23 23
      mpi/tests/Makefile.am
  29. 4 1
      mpi/tests/driver.c
  30. 4 1
      mpi/tests/mpi_earlyrecv.c
  31. 2 2
      mpi/tests/mpi_earlyrecv2.c
  32. 2 2
      mpi/tests/mpi_earlyrecv2_sync.c
  33. 4 1
      mpi/tests/mpi_test.c
  34. 4 1
      mpi/tests/multiple_send.c
  35. 4 4
      mpi/tests/pingpong.c
  36. 10 1
      mpi/tests/sendrecv_bench.c
  37. 2 1
      src/common/fxt.c
  38. 2 2
      src/common/graph.c
  39. 7 2
      src/common/list.h
  40. 14 4
      src/common/prio_list.h
  41. 21 0
      src/common/rbtree.h
  42. 6 3
      src/common/thread.c
  43. 10 10
      src/core/dependencies/cg.c
  44. 2 2
      src/core/dependencies/cg.h
  45. 44 16
      src/core/dependencies/implicit_data_deps.c
  46. 2 2
      src/core/dependencies/implicit_data_deps.h
  47. 6 6
      src/core/dependencies/tags.c
  48. 5 7
      src/core/jobs.c
  49. 8 8
      src/core/perfmodel/perfmodel_history.c
  50. 36 9
      src/core/simgrid.c
  51. 15 2
      src/core/simgrid.h
  52. 28 9
      src/core/simgrid_cpp.cpp
  53. 27 11
      src/core/task.c
  54. 5 5
      src/core/task_bundle.c
  55. 11 5
      src/core/topology.c
  56. 2 2
      src/core/workers.h
  57. 49 38
      src/datawizard/filters.c
  58. 2 1
      src/datawizard/interfaces/bcsr_filters.c
  59. 53 51
      src/datawizard/interfaces/data_interface.c
  60. 6 6
      src/datawizard/memstats.c
  61. 33 15
      src/datawizard/user_interactions.c
  62. 7 7
      src/drivers/mpi/driver_mpi_source.h
  63. 2 2
      src/profiling/bound.c
  64. 2 2
      src/util/openmp_runtime_support.c
  65. 11 1
      tests/Makefile.am
  66. 2 2
      tests/cholesky/sched.sh
  67. 3 3
      tests/cholesky/sched_one_gpu.sh
  68. 2 1
      tests/cholesky_ctxs/evaluate_expression.sh
  69. 24 24
      tests/coverage/coverage.sh
  70. 51 4
      tests/datawizard/acquire_cb.c
  71. 1 1
      tests/datawizard/interfaces/test_interfaces.sh
  72. 3 3
      tests/datawizard/locality.sh
  73. 5 5
      tests/experiments/bandwidth_cuda/bench_bandwidth.sh
  74. 2 2
      tests/heat/deps.sh
  75. 4 4
      tests/heat/dmda.sh
  76. 2 2
      tests/heat/gflops.sh
  77. 2 2
      tests/heat/gflops_sched.sh
  78. 9 9
      tests/heat/granularity.sh
  79. 5 5
      tests/heat/heat.sh
  80. 2 2
      tests/heat/model_perturbation.sh
  81. 3 3
      tests/heat/sched.sh
  82. 2 2
      tests/heat/speedup.sh
  83. 2 2
      tests/incrementer/speed.sh
  84. 51 13
      tests/loader.c
  85. 2 2
      tests/main/combined_workers/bfs/run.sh
  86. 2 2
      tests/memory/memstress.sh
  87. 2 2
      tests/memory/memstress2.sh
  88. 19 0
      tests/microbenchs/async_tasks_data_overhead.sh
  89. 53 25
      tests/microbenchs/async_tasks_overhead.c
  90. 2 2
      tests/microbenchs/microbench.sh
  91. 2 2
      tests/microbenchs/starpu_check.sh
  92. 19 0
      tests/microbenchs/sync_tasks_data_overhead.sh
  93. 70 15
      tests/microbenchs/sync_tasks_overhead.c
  94. 19 0
      tests/microbenchs/tasks_data_overhead.sh
  95. 63 42
      tests/microbenchs/tasks_overhead.c
  96. 2 2
      tests/microbenchs/tasks_size_overhead.sh
  97. 2 2
      tests/microbenchs/tasks_size_overhead_sched.sh
  98. 3 1
      tests/microbenchs/tasks_size_overhead_scheds.sh
  99. 2 2
      tests/mult/gflops.sh
  100. 0 0
      tests/mult/sched.sh

+ 2 - 0
ChangeLog

@@ -43,6 +43,8 @@ Small features:
   * Move optimized cuda 2d copy from interfaces to new
     starpu_cuda_copy2d_async_sync and starpu_cuda_copy3d_async_sync, and use
     them from starpu_interface_copy2d and 3d.
+  * New function starpu_task_watchdog_set_hook to specify a function
+    to be called when the watchdog is raised
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================

+ 30 - 17
configure.ac

@@ -273,34 +273,38 @@ if test x$enable_simgrid = xyes ; then
 		]
 	)
 	AC_CHECK_HEADERS([simgrid/msg.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MSG_H], [1], [Define to 1 if you have msg.h in simgrid/.])])
+	AC_CHECK_HEADERS([msg/msg.h], [AC_DEFINE([STARPU_HAVE_MSG_MSG_H], [1], [Define to 1 if you have msg.h in msg/.])])
 	AC_CHECK_HEADERS([simgrid/host.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_HOST_H], [1], [Define to 1 if you have host.h in simgrid/.])])
+	AC_CHECK_HEADERS([xbt/base.h], [AC_DEFINE([STARPU_HAVE_XBT_BASE_H], [1], [Define to 1 if you have base.h in xbt/.])])
+	AC_CHECK_HEADERS([simgrid/version.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_VERSION_H], [1], [Define to 1 if you have version.h in simgrid/.])], [], [[
+			  #ifdef STARPU_HAVE_XBT_BASE_H
+			  #include <xbt/base.h>
+			  #endif
+			  ]])
 	AC_CHECK_HEADERS([simgrid/simdag.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_SIMDAG_H], [1], [Define to 1 if you have simdag.h in simgrid/.])])
 	AC_CHECK_HEADERS([xbt/synchro.h], [AC_DEFINE([STARPU_HAVE_XBT_SYNCHRO_H], [1], [Define to 1 if you have synchro.h in xbt/.])])
+	AC_CHECK_HEADERS([xbt/config.h], [AC_DEFINE([STARPU_HAVE_XBT_CONFIG_H], [1], [Define to 1 if you have config.h in xbt/.])])
+	AC_CHECK_HEADERS([simgrid/actor.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_ACTOR_H], [1], [Define to 1 if you have actor.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/engine.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_ENGINE_H], [1], [Define to 1 if you have engine.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/semaphore.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_SEMAPHORE_H], [1], [Define to 1 if you have semaphore.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/mutex.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MUTEX_H], [1], [Define to 1 if you have mutex.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/cond.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_COND_H], [1], [Define to 1 if you have cond.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/barrier.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_BARRIER_H], [1], [Define to 1 if you have barrier.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/engine.h])
+	AC_CHECK_HEADERS([simgrid/zone.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_ZONE_H], [1], [Define to 1 if you have zone.h in simgrid/.])])
 	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
 
 	# Latest functions
-	AC_CHECK_FUNCS([MSG_process_attach sg_actor_attach sg_actor_init MSG_zone_get_hosts MSG_process_self_name MSG_process_userdata_init sg_actor_data])
-	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_speed simcall_process_create sg_config_continue_after_help])
+	AC_CHECK_FUNCS([MSG_process_attach sg_actor_attach sg_actor_init MSG_zone_get_hosts sg_zone_get_hosts MSG_process_self_name MSG_process_userdata_init sg_actor_data])
+	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data SMPI_thread_create sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_speed simcall_process_create sg_config_continue_after_help])
+	AC_CHECK_FUNCS([simgrid_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMGRID_INIT], [1], [Define to 1 if you have the `simgrid_init' function.])])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
+	AC_CHECK_FUNCS([sg_actor_sleep_for sg_actor_self sg_actor_ref sg_host_get_properties sg_host_send_to sg_host_sendto sg_cfg_set_int sg_actor_self_execute simgrid_get_clock])
 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
 
 	# Oldies for compatibility with older simgrid
 	AC_CHECK_FUNCS([MSG_get_as_by_name MSG_zone_get_by_name MSG_environment_get_routing_root MSG_host_get_speed])
 
-	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
-		    		[[
-#ifdef STARPU_HAVE_SIMGRID_MSG_H
-#include <simgrid/msg.h>
-#else
-#include <msg/msg.h>
-#endif
-				 ]],
-				[[msg_host_t foo; ]]
-			    )],
-	                 [],
-	                 [
-			   AC_MSG_ERROR(StarPU needs a version of Simgrid which defines the type msg_host_t (should be any version >= 3.8.1))
-		         ])
 	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
 	# We won't bind or detect anything
 	with_hwloc=no
@@ -727,7 +731,7 @@ fi
 if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
     if test x$enable_simgrid = xyes ; then
         if test x$enable_shared = xyes ; then
-	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, use --disable-shared to fix this])
+	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, use --disable-shared to fix this, or disable MPI with --disable-mpi])
         else
 	    CFLAGS="$CFLAGS -fPIC"
 	    CXXFLAGS="$CXXFLAGS -fPIC"
@@ -920,6 +924,12 @@ if test x$have_pthread_setname_np = xyes; then
 	AC_DEFINE(STARPU_HAVE_PTHREAD_SETNAME_NP,[1],[pthread_setname_np is available])
 fi
 
+if test "x$cross_compiling" = "xno"; then
+	STARPU_INIT_ZERO([[#include <pthread.h>]], pthread_mutex_t, PTHREAD_MUTEX_INITIALIZER)
+	STARPU_INIT_ZERO([[#include <pthread.h>]], pthread_cond_t, PTHREAD_COND_INITIALIZER)
+	STARPU_INIT_ZERO([[#include <pthread.h>]], pthread_rwlock_t, PTHREAD_RWLOCK_INITIALIZER)
+fi
+
 # There is no posix_memalign on Mac OS X, only memalign
 AC_CHECK_FUNCS([posix_memalign], [AC_DEFINE([STARPU_HAVE_POSIX_MEMALIGN], [1], [Define to 1 if you have the `posix_memalign' function.])])
 AC_CHECK_FUNCS([memalign], [AC_DEFINE([STARPU_HAVE_MEMALIGN], [1], [Define to 1 if you have the `memalign' function.])])
@@ -3564,6 +3574,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x doc/doxygen/doxygen_filter.sh
   chmod +x doc/doxygen_dev/doxygen_filter.sh
   mkdir -p tests/microbenchs
+  test -e tests/microbenchs/tasks_data_overhead.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/tasks_data_overhead.sh tests/microbenchs/
+  test -e tests/microbenchs/sync_tasks_data_overhead.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/sync_tasks_data_overhead.sh tests/microbenchs/
+  test -e tests/microbenchs/async_tasks_data_overhead.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/async_tasks_data_overhead.sh tests/microbenchs/
   test -e tests/microbenchs/tasks_size_overhead.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/tasks_size_overhead.sh tests/microbenchs/
   test -e tests/microbenchs/tasks_size_overhead_sched.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/tasks_size_overhead_sched.sh tests/microbenchs/
   test -e tests/microbenchs/tasks_size_overhead_scheds.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/tasks_size_overhead_scheds.sh tests/microbenchs/

+ 87 - 2
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -572,15 +572,37 @@ $ starpu_paje_sort paje.trace
 
 \section PapiCounters PAPI counters
 
-Performance counter values can be obtained from the PAPI framework if
+Performance counter values could be obtained from the PAPI framework if
 <c>./configure</c> detected the libpapi. One has to set the \ref STARPU_PROFILING
-environment variable to 1 and then specify which counters to record with the
+environment variable to 1 and then specify which events to record with the
 \ref STARPU_PROF_PAPI_EVENTS environment variable. For instance:
 
 \verbatim
 export STARPU_PROFILING=1 STARPU_PROF_PAPI_EVENTS="PAPI_TOT_INS PAPI_TOT_CYC"
 \endverbatim
 
+In the current simple implementation, only CPU tasks have their events measured,
+and this requires CPUs that support the PAPI events. All events that PAPI supports are
+listed in its documentation (https://icl.cs.utk.edu/projects/papi/wiki/PAPIC:Preset_Event_Definitions).
+It is important to note that not all events are available on all systems, and
+general PAPI recommendations should be followed.
+
+The counter values can be accessed using the profiling interface:
+\code{.c}
+task->profiling_info->papi_values
+\endcode
+Also, they can be accessed and/or saved with tracing when using \ref STARPU_FXT_TRACE. With the use of <c>starpu_fxt_tool</c>,
+the file <c>papi.rec</c> is generated, containing the following triple:
+
+\verbatim
+Task Id
+Event Id
+Value
+\endverbatim
+
+External tools like <c>rec2csv</c> can be used to convert this rec file to a <c>csv</c>, where each
+line represents a value for an event for a task.
+
 \section TheoreticalLowerBoundOnExecutionTime Theoretical Lower Bound On Execution Time
 
 StarPU can record a trace of what tasks are needed to complete the
@@ -645,6 +667,69 @@ the priorities as the StarPU scheduler would, i.e. schedule prioritized
 tasks before less prioritized tasks, to check to which extent this results
 in a less optimal solution. This increases computation time even more.
 
+\section starvz Trace visualization with StarVZ
+
+Creating views with StarVZ (see: https://github.com/schnorr/starvz) is made up of two steps. The initial
+stage consists of a pre-processing of the traces generated by the application.
+The second step consists of the analysis itself and is carried out with the
+aid of R packages. To download and install StarVZ, it is necessary to have R,
+pajeng and the following packages:
+
+\verbatim
+# For pajeng
+apt install -y git cmake build-essential libboost-dev asciidoc flex bison
+git clone git://github.com/schnorr/pajeng.git
+mkdir -p pajeng/b ; cd pajeng/b
+cmake ..
+make
+
+# For R tidyverse
+apt install -y r-base libxml2-dev libssl-dev libcurl4-openssl-dev libgit2-dev libboost-dev
+\endverbatim
+
+To install StarVZ, the following commands can be used:
+
+\verbatim
+git clone https://github.com/schnorr/starvz.git
+echo "install.packages(c('tidyverse', 'devtools'), repos = 'https://cloud.r-project.org')" | R --vanilla
+echo "library(devtools); devtools::install_local(path='./starvz/R_package')" | R --vanilla
+\endverbatim
+
+To generate traces from an application, it is necessary to set \ref STARPU_GENERATE_TRACE
+and build StarPU with FxT. Then, Step 1 of StarVZ can be used on a folder with
+StarPU FxT traces:
+
+\verbatim
+export PATH=starvz/:$PATH
+export PATH=pajeng/b:$PATH
+export PATH=$STARPU_HOME/bin:$PATH
+
+./starvz/src/phase1-workflow.sh /tmp/ ""
+\endverbatim
+
+Then the second step can be executed directly in R. StarVZ enables a set of
+different plots that can be configured in a .yaml file. A default file,
+<c>full_config.yaml</c>, is provided; the options can also be changed directly in R.
+
+\verbatim
+library(starvz)
+dtrace <- the_fast_reader_function("./")
+
+pajer <- config::get(file = "starvz/full_config.yaml")
+
+pajer$starpu$active = TRUE
+pajer$submitted$active = TRUE
+pajer$st$abe$active = TRUE
+
+plot <- the_master_function(dtrace)
+\endverbatim
+
+An example of visualization follows:
+
+\image html starvz_visu.png
+\image latex starvz_visu.eps "" width=\textwidth
+
+
 \section MemoryFeedback Memory Feedback
 
 It is possible to enable memory statistics. To do so, you need to pass

+ 3 - 2
doc/doxygen/chapters/470_simgrid.doxy

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011,2012,2014,2016,2017                 Inria
  * Copyright (C) 2010-2019                                CNRS
- * Copyright (C) 2009-2011,2014-2019                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014-2020                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,7 +23,8 @@
 /*! \page SimGridSupport SimGrid Support
 
 StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform. This was tested with SimGrid from 3.11 to 3.16, and 3.18 to 3.24.
+platform. This was tested with SimGrid from 3.11 to 3.16, and 3.18 to
+3.25. SimGrid versions 3.25 and above need to be configured with <c>-Denable_msg=ON</c>.
 Other versions may have compatibility issues. 3.17 notably does not build at
 all. MPI simulation does not work with version 3.22.
 

+ 22 - 1
doc/doxygen/chapters/api/threads.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2017, 2019                          CNRS
- * Copyright (C) 2009-2011,2014,2016                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014,2016,2020                 Université de Bordeaux
  * Copyright (C) 2011,2012                                Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -35,6 +35,13 @@ Call starpu_pthread_create() and abort on error.
 \ingroup API_Threads
 Call starpu_pthread_mutex_init() and abort on error.
 
+\def STARPU_PTHREAD_MUTEX_INIT0
+\ingroup API_Threads
+Call starpu_pthread_mutex_init() only if the content of
+PTHREAD_MUTEX_INITIALIZER is not zero. This should be called instead
+of STARPU_PTHREAD_MUTEX_INIT when it is known that the content of the
+pthread_mutex_t was already zeroed.
+
 \def STARPU_PTHREAD_MUTEX_DESTROY
 \ingroup API_Threads
 Call starpu_pthread_mutex_destroy() and abort on error.
@@ -67,6 +74,13 @@ Call starpu_pthread_getspecific() and abort on error.
 \ingroup API_Threads
 Call starpu_pthread_rwlock_init() and abort on error.
 
+\def STARPU_PTHREAD_RWLOCK_INIT0
+\ingroup API_Threads
+Call starpu_pthread_rwlock_init() only if the content of
+PTHREAD_RWLOCK_INITIALIZER is not zero. This should be called instead
+of STARPU_PTHREAD_RWLOCK_INIT when it is known that the content of the
+pthread_rwlock_t was already zeroed.
+
 \def STARPU_PTHREAD_RWLOCK_RDLOCK
 \ingroup API_Threads
 Call starpu_pthread_rwlock_rdlock() and abort on error.
@@ -87,6 +101,13 @@ Call starpu_pthread_rwlock_destroy() and abort on error.
 \ingroup API_Threads
 Call starpu_pthread_cond_init() and abort on error.
 
+\def STARPU_PTHREAD_COND_INIT0
+\ingroup API_Threads
+Call starpu_pthread_cond_init() only if the content of
+PTHREAD_COND_INITIALIZER is not zero. This should be called instead
+of STARPU_PTHREAD_COND_INIT when it is known that the content of the
+pthread_cond_t was already zeroed.
+
 \def STARPU_PTHREAD_COND_DESTROY
 \ingroup API_Threads
 Call starpu_pthread_cond_destroy() and abort on error.

BIN
doc/doxygen/chapters/images/starvz_visu.eps


BIN
doc/doxygen/chapters/images/starvz_visu.png


+ 1 - 2
doc/doxygen_dev/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010-2018                                CNRS
+# Copyright (C) 2010-2018, 2020                          CNRS
 # Copyright (C) 2013-2018                                Inria
 # Copyright (C) 2009,2011,2013,2014,2017                 Université de Bordeaux
 #
@@ -126,7 +126,6 @@ $(DOX_TAG): $(dox_inputs)
 
 $(DOX_PDF): $(DOX_TAG) refman.tex
 	@cp $(top_srcdir)/doc/doxygen_dev/chapters/version.sty $(DOX_LATEX_DIR)
-	@-cp $(top_srcdir)/doc/doxygen_dev/chapters/images/*pdf $(DOX_LATEX_DIR)
 	@echo $(PDFLATEX) $(DOX_LATEX_DIR)/refman.tex
 	@cd $(DOX_LATEX_DIR) ;\
 	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out ;\

+ 2 - 2
examples/cholesky/cholesky.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2018-2019                                Université de Bordeaux
+# Copyright (C) 2018-2020                                Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -34,7 +34,7 @@ for size in `seq 2 2 30` ; do
 	for STARPU_SCHED in $STARPU_SCHEDS
 	do
 		export STARPU_SCHED
-		GFLOPS=`${ROOT}_implicit -size $((size * 960)) -nblocks $size 2> /dev/null | grep -v GFlops | cut -d '	' -f 3`
+		GFLOPS=`$STARPU_LAUNCH ${ROOT}_implicit -size $((size * 960)) -nblocks $size 2> /dev/null | grep -v GFlops | cut -d '	' -f 3`
 		[ -n "$GFLOPS" ] || GFLOPS='""'
 		echo -n "	$GFLOPS"
 	done

+ 9 - 9
examples/heat/heat.sh

@@ -3,7 +3,7 @@
 #
 # Copyright (C) 2017                                     CNRS
 # Copyright (C) 2017                                     Inria
-# Copyright (C) 2017                                     Université de Bordeaux
+# Copyright (C) 2017, 2020                                     Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -28,23 +28,23 @@ if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/heat" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/heat
 fi
 
-$PREFIX/heat -shape 0
-$PREFIX/heat -shape 1
+$STARPU_LAUNCH $PREFIX/heat -shape 0
+$STARPU_LAUNCH $PREFIX/heat -shape 1
 # sometimes lead to pivot being 0
-#$PREFIX/heat -shape 2
+#$STARPU_LAUNCH $PREFIX/heat -shape 2
 
-$PREFIX/heat -cg
+$STARPU_LAUNCH $PREFIX/heat -cg
 
 # TODO: FIXME
 
 # segfault
-#$PREFIX/heat -v1
+#$STARPU_LAUNCH $PREFIX/heat -v1
 
 # (actually the default...)
-$PREFIX/heat -v2
+$STARPU_LAUNCH $PREFIX/heat -v2
 
 # hang
-#$PREFIX/heat -v3
+#$STARPU_LAUNCH $PREFIX/heat -v3
 
 # hang
-#$PREFIX/heat -v4
+#$STARPU_LAUNCH $PREFIX/heat -v4

+ 11 - 11
examples/lu/lu.sh

@@ -2,7 +2,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2017                                     CNRS
-# Copyright (C) 2017,2019                                Université de Bordeaux
+# Copyright (C) 2017,2019-2020                                Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -27,11 +27,11 @@ if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_implicit_example_float" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_implicit_example_float
 fi
 
-$PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -piv
-$PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -no-stride
-$PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -bound
-$PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bounddeps
-$PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
+$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -piv
+$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -no-stride
+$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -bound
+$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bounddeps
+$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
 
 if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/lu_example_float
@@ -39,8 +39,8 @@ if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_example_float" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_example_float
 fi
 
-$PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -piv
-$PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -no-stride
-$PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -bound
-$PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bounddeps
-$PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
+$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -piv
+$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -no-stride
+$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -bound
+$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bounddeps
+$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio

+ 19 - 3
examples/mult/xgemm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2017, 2019                                Université de Bordeaux
+ * Copyright (C) 2009-2017,2019-2020                      Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2017                                     Erwan Leria
  * Copyright (C) 2010                                     Mehdi Juhoor
@@ -29,6 +29,7 @@
 
 #include <limits.h>
 #include <string.h>
+#include <unistd.h>
 #include <math.h>
 #include <sys/types.h>
 #include <starpu.h>
@@ -58,6 +59,7 @@ static unsigned zdim = 960*4;
 #endif
 static unsigned check = 0;
 static unsigned bound = 0;
+static unsigned print_hostname = 0;
 
 static TYPE *A, *B, *C;
 static starpu_data_handle_t A_handle, B_handle, C_handle;
@@ -304,6 +306,11 @@ static void parse_args(int argc, char **argv)
 			bound = 1;
 		}
 
+		else if (strcmp(argv[i], "-hostname") == 0)
+		{
+			print_hostname = 1;
+		}
+
 		else if (strcmp(argv[i], "-check") == 0)
 		{
 			check = 1;
@@ -316,7 +323,7 @@ static void parse_args(int argc, char **argv)
 
 		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
 		{
-			fprintf(stderr,"Usage: %s [-nblocks n] [-nblocksx x] [-nblocksy y] [-x x] [-y y] [-xy n] [-z z] [-size size] [-iter iter] [-bound] [-check] [-spmd]\n", argv[0]);
+			fprintf(stderr,"Usage: %s [-nblocks n] [-nblocksx x] [-nblocksy y] [-x x] [-y y] [-xy n] [-z z] [-size size] [-iter iter] [-bound] [-check] [-spmd] [-hostname]\n", argv[0]);
 			fprintf(stderr,"Currently selected: %ux%u * %ux%u and %ux%u blocks, %u iterations\n", zdim, ydim, xdim, zdim, nslicesx, nslicesy, niter);
 			exit(EXIT_SUCCESS);
 		}
@@ -400,10 +407,19 @@ int main(int argc, char **argv)
 	if (bound)
 		starpu_bound_compute(&min, &min_int, 1);
 
-	PRINTF("# x\ty\tz\tms\tGFlops");
+	PRINTF("# ");
+	if (print_hostname)
+		PRINTF("node\t");
+	PRINTF("x\ty\tz\tms\tGFlops");
 	if (bound)
 		PRINTF("\tTms\tTGFlops\tTims\tTiGFlops");
 	PRINTF("\n");
+	if (print_hostname)
+	{
+		char hostname[255];
+		gethostname(hostname, 255);
+		PRINTF("%s\t", hostname);
+	}
 	PRINTF("%u\t%u\t%u\t%.0f\t%.1f", xdim, ydim, zdim, timing/niter/1000.0, flops/timing/1000.0);
 	if (bound)
 		PRINTF("\t%.0f\t%.1f\t%.0f\t%.1f", min, flops/min/1000000.0, min_int, flops/min_int/1000000.0);

+ 2 - 2
examples/scheduler/schedulers.sh

@@ -3,7 +3,7 @@
 #
 # Copyright (C) 2012                                     Inria
 # Copyright (C) 2012-2015,2017,2018                      CNRS
-# Copyright (C) 2012,2017,2019                           Université de Bordeaux
+# Copyright (C) 2012,2017,2019-2020                      Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -40,6 +40,6 @@ fi
 for sched in $SCHEDULERS
 do
     echo "cholesky.$sched"
-    STARPU_SCHED=$sched ./cholesky/cholesky_tag -size $((960*3)) -nblocks 3
+    STARPU_SCHED=$sched $STARPU_LAUNCH ./cholesky/cholesky_tag -size $((960*3)) -nblocks 3
     check_success $?
 done

+ 2 - 2
examples/scheduler/schedulers_context.sh

@@ -2,7 +2,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2012,2014,2015,2017,2018                 CNRS
-# Copyright (C) 2017,2019                                Université de Bordeaux
+# Copyright (C) 2017,2019-2020                           Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -39,6 +39,6 @@ fi
 for sched in $SCHEDULERS
 do
     echo "sched_ctx.$sched"
-    STARPU_SCHED=$sched ./sched_ctx/sched_ctx
+    STARPU_SCHED=$sched $STARPU_LAUNCH ./sched_ctx/sched_ctx
     check_success $?
 done

+ 5 - 1
include/starpu_config.h.in

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2014,2016,2017                 Inria
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2010-2017,2019                           CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,6 +44,7 @@
 #undef STARPU_SIMGRID_MC
 #undef STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT
 #undef STARPU_HAVE_SIMGRID_MSG_H
+#undef STARPU_HAVE_MSG_MSG_H
 #undef STARPU_HAVE_SIMGRID_ACTOR_H
 #undef STARPU_HAVE_SIMGRID_SEMAPHORE_H
 #undef STARPU_HAVE_SIMGRID_MUTEX_H
@@ -166,6 +167,9 @@ typedef ssize_t starpu_ssize_t;
 #undef STARPU_HAVE_PTHREAD_BARRIER
 #undef STARPU_HAVE_PTHREAD_SETNAME_NP
 #undef STARPU_HAVE_STRUCT_TIMESPEC
+#undef STARPU_PTHREAD_MUTEX_INITIALIZER_ZERO
+#undef STARPU_PTHREAD_COND_INITIALIZER_ZERO
+#undef STARPU_PTHREAD_RWLOCK_INITIALIZER_ZERO
 
 /* This is only for building examples */
 #undef STARPU_HAVE_HELGRIND_H

+ 8 - 2
include/starpu_task.h

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2017,2019                           Inria
+ * Copyright (C) 2011-2017,2020                           Inria
  * Copyright (C) 2009-2019                                Université de Bordeaux
- * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
+ * Copyright (C) 2010-2015,2017,2018,2019,2020            CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2016                                     Uppsala University
  *
@@ -1667,6 +1667,12 @@ void starpu_task_ft_failed(struct starpu_task *task);
  */
 void starpu_task_ft_success(struct starpu_task *meta_task);
 
+/**
+   Set the function to call when the watchdog detects that StarPU has
+   not finished any task for STARPU_WATCHDOG_TIMEOUT seconds
+*/
+void starpu_task_watchdog_set_hook(void (*hook)(void *), void *hook_arg);
+
 /** @} */
 
 #ifdef __cplusplus

+ 10 - 4
include/starpu_thread.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2013,2015-2017                           Inria
  * Copyright (C) 2010-2015,2017,2019                           CNRS
- * Copyright (C) 2010,2012-2019                           Université de Bordeaux
+ * Copyright (C) 2010,2012-2020                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,7 +25,10 @@
 #include <starpu_util.h>
 #ifdef STARPU_SIMGRID
 #include <pthread.h>
-#ifdef STARPU_HAVE_XBT_SYNCHRO_H
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+#include <simgrid/mutex.h>
+#include <simgrid/cond.h>
+#elif defined(STARPU_HAVE_XBT_SYNCHRO_H)
 #include <xbt/synchro.h>
 #else
 #include <xbt/synchro_core.h>
@@ -45,9 +48,12 @@
 #ifdef STARPU_HAVE_SIMGRID_BARRIER_H
 #include <simgrid/barrier.h>
 #endif
+#ifdef STARPU_HAVE_SIMGRID_HOST_H
+#include <simgrid/host.h>
+#endif
 #ifdef STARPU_HAVE_SIMGRID_MSG_H
 #include <simgrid/msg.h>
-#else
+#elif defined(STARPU_HAVE_MSG_MSG_H)
 #include <msg/msg.h>
 #endif
 #elif !defined(_MSC_VER) || defined(BUILDING_STARPU)
@@ -81,7 +87,7 @@ typedef msg_host_t starpu_sg_host_t;
 #endif
 int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2);
 starpu_pthread_t starpu_pthread_self(void);
-int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, starpu_sg_host_t host);
+int starpu_pthread_create_on(const char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, starpu_sg_host_t host);
 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
 starpu_pthread_t _starpu_simgrid_actor_create(const char *name, xbt_main_func_t code, starpu_sg_host_t host, int argc, char *argv[]);
 int starpu_pthread_join(starpu_pthread_t thread, void **retval);

+ 53 - 5
include/starpu_thread_util.h

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2013                                Inria
- * Copyright (C) 2010-2013,2015,2017,2019                      CNRS
- * Copyright (C) 2010-2014,2016,2017                      Université de Bordeaux
+ * Copyright (C) 2010-2013,2015,2017,2019                 CNRS
+ * Copyright (C) 2010-2014,2016,2017,2020                 Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -64,7 +64,7 @@
  * Encapsulation of the starpu_pthread_mutex_* functions.
  */
 
-#define STARPU_PTHREAD_MUTEX_INIT(mutex, attr) do {                           \
+#define _STARPU_PTHREAD_MUTEX_INIT(mutex, attr) do {                           \
 	int p_ret = starpu_pthread_mutex_init((mutex), (attr));                \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
@@ -74,6 +74,22 @@
 	}                                                                      \
 } while (0)
 
+#ifdef STARPU_PTHREAD_MUTEX_INITIALIZER_ZERO /* presumably set by configure's STARPU_INIT_ZERO check: initializer is all-zero bytes */
+#define STARPU_PTHREAD_MUTEX_INIT(mutex, attr) do {                            \
+	if (!(attr)) /* no attributes requested: zero-filling is enough */     \
+		memset((mutex), 0, sizeof(*(mutex)));                          \
+	else                                                                   \
+		_STARPU_PTHREAD_MUTEX_INIT(mutex, attr);                       \
+} while (0)
+#define STARPU_PTHREAD_MUTEX_INIT0(mutex, attr) do {                           \
+	if (attr) /* NULL attr: nothing is done, so the storage has to be zero-filled already */ \
+		_STARPU_PTHREAD_MUTEX_INIT(mutex, attr);                       \
+} while (0)
+#else /* no zero-fill shortcut: both variants always call the checked initializer */
+#define STARPU_PTHREAD_MUTEX_INIT(mutex, attr) _STARPU_PTHREAD_MUTEX_INIT(mutex, attr)
+#define STARPU_PTHREAD_MUTEX_INIT0(mutex, attr) _STARPU_PTHREAD_MUTEX_INIT(mutex, attr)
+#endif
+
 #define STARPU_PTHREAD_MUTEX_DESTROY(mutex) do {                              \
 	int p_ret = starpu_pthread_mutex_destroy(mutex);                       \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
@@ -199,7 +215,7 @@ int _starpu_pthread_mutex_trylock_sched(starpu_pthread_mutex_t *mutex, char *fil
 /*
  * Encapsulation of the starpu_pthread_rwlock_* functions.
  */
-#define STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr) do {                          \
+#define _STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr) do {                         \
 	int p_ret = starpu_pthread_rwlock_init((rwlock), (attr));              \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
@@ -209,6 +225,22 @@ int _starpu_pthread_mutex_trylock_sched(starpu_pthread_mutex_t *mutex, char *fil
 	}                                                                      \
 } while (0)
 
+#ifdef STARPU_PTHREAD_RWLOCK_INITIALIZER_ZERO /* presumably set by configure's STARPU_INIT_ZERO check: initializer is all-zero bytes */
+#define STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr) do {                          \
+	if (!(attr)) /* no attributes requested: zero-filling is enough */     \
+		memset((rwlock), 0, sizeof(*(rwlock)));                        \
+	else                                                                   \
+		_STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr);                     \
+} while (0)
+#define STARPU_PTHREAD_RWLOCK_INIT0(rwlock, attr) do {                         \
+	if (attr) /* NULL attr: nothing is done, so the storage has to be zero-filled already */ \
+		_STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr);                     \
+} while (0)
+#else /* no zero-fill shortcut: both variants always call the checked initializer */
+#define STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr) _STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr)
+#define STARPU_PTHREAD_RWLOCK_INIT0(rwlock, attr) _STARPU_PTHREAD_RWLOCK_INIT(rwlock, attr)
+#endif
+
 #define STARPU_PTHREAD_RWLOCK_RDLOCK(rwlock) do {                              \
 	int p_ret = starpu_pthread_rwlock_rdlock(rwlock);                      \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
@@ -282,7 +314,7 @@ int _starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock, char *file
 /*
  * Encapsulation of the starpu_pthread_cond_* functions.
  */
-#define STARPU_PTHREAD_COND_INIT(cond, attr) do {                             \
+#define _STARPU_PTHREAD_COND_INIT(cond, attr) do {                             \
 	int p_ret = starpu_pthread_cond_init((cond), (attr));                  \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
 		fprintf(stderr,                                                \
@@ -292,6 +324,22 @@ int _starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock, char *file
 	}                                                                      \
 } while (0)
 
+#ifdef STARPU_PTHREAD_COND_INITIALIZER_ZERO /* presumably set by configure's STARPU_INIT_ZERO check: initializer is all-zero bytes */
+#define STARPU_PTHREAD_COND_INIT(cond, attr) do {                              \
+	if (!(attr)) /* no attributes requested: zero-filling is enough */     \
+		memset((cond), 0, sizeof(*(cond)));                            \
+	else                                                                   \
+		_STARPU_PTHREAD_COND_INIT(cond, attr);                         \
+} while (0)
+#define STARPU_PTHREAD_COND_INIT0(cond, attr) do {                             \
+	if (attr) /* NULL attr: nothing is done, so the storage has to be zero-filled already */ \
+		_STARPU_PTHREAD_COND_INIT(cond, attr);                         \
+} while (0)
+#else /* no zero-fill shortcut: both variants always call the checked initializer */
+#define STARPU_PTHREAD_COND_INIT(cond, attr) _STARPU_PTHREAD_COND_INIT(cond, attr)
+#define STARPU_PTHREAD_COND_INIT0(cond, attr) _STARPU_PTHREAD_COND_INIT(cond, attr)
+#endif
+
 #define STARPU_PTHREAD_COND_DESTROY(cond) do {                                \
 	int p_ret = starpu_pthread_cond_destroy(cond);                         \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \

+ 21 - 1
m4/libs.m4

@@ -2,7 +2,7 @@
 #
 # Copyright (C) 2011                                     Inria
 # Copyright (C) 2012,2017                                CNRS
-# Copyright (C) 2011,2014                                Université de Bordeaux
+# Copyright (C) 2011,2014,2020                           Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -53,3 +53,23 @@ AC_DEFUN([STARPU_CHECK_LIB], [dnl
 AC_DEFUN([STARPU_HAVE_LIBRARY], [dnl
 STARPU_CHECK_LIB([$1], [$2], main, [$3], [$4], [$5])
 ])dnl
+
+# STARPU_INIT_ZERO(INCLUDES, TYPE, INIT_MACRO)
+# Checks, by running a test program, whether a TYPE variable initialized with
+# INIT_MACRO is all zero bytes; defines STARPU_<INIT_MACRO>_ZERO if so (answers no when cross-compiling)
+AC_DEFUN([STARPU_INIT_ZERO], [dnl
+AC_MSG_CHECKING([whether $3 is just zeroes])
+AC_RUN_IFELSE([AC_LANG_PROGRAM(
+		$1,
+		[[$2 var = $3;
+		 char *p;
+		 for (p = (char*) &var; p < (char*) (&var+1); p++)
+		   if (*p != 0)
+		     return 1;
+		 return 0;
+		]])],
+		[AC_DEFINE([STARPU_$3_ZERO], [1], [Define to 1 if `$3' is just zeroes])
+		 AC_MSG_RESULT(yes)],
+		[AC_MSG_RESULT(no)],
+		[AC_MSG_RESULT([no (cross-compiling)])])
+])dnl

+ 1 - 5
mpi/examples/Makefile.am

@@ -2,7 +2,7 @@
 #
 # Copyright (C) 2012,2014,2016                           Inria
 # Copyright (C) 2010-2017,2019                           CNRS
-# Copyright (C) 2009-2017,2019                           Université de Bordeaux
+# Copyright (C) 2009-2017,2019-2020                      Université de Bordeaux
 # Copyright (C) 2013                                     Thibaut Lambert
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -248,13 +248,11 @@ matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
-if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky_distributed
 endif
 endif
-endif
 
 ########################
 # MPI Matrix mult example #
@@ -336,11 +334,9 @@ complex_mpi_complex_SOURCES =		\
 	complex/mpi_complex.c		\
 	../../examples/interface/complex_interface.c
 
-if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=			\
 	complex/mpi_complex
 endif
-endif
 
 #########################
 # user_datatype example #

+ 3 - 3
mpi/examples/perf.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010,2011,2014                           Université de Bordeaux
+# Copyright (C) 2010,2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -40,7 +40,7 @@ ncalibrate=0
 for i in `seq 1 $ncalibrate`
 do
 echo "STARPU_CALIBRATE $i/$ncalibrate"
-STARPU_CALIBRATE=1 STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $nnodes ./mpi_lu/plu_example_float -p 2 -q 2 -nblocks 32 -size $((32*$BLOCKSIZE)) -numa
+STARPU_CALIBRATE=1 STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $nnodes $STARPU_LAUNCH ./mpi_lu/plu_example_float -p 2 -q 2 -nblocks 32 -size $((32*$BLOCKSIZE)) -numa
 done
 
 func()
@@ -57,7 +57,7 @@ echo "*******************************************">> log
 cat log
 cat log >> log.all
 
-STARPU_NCPUS=0 STARPU_NCUDA=$ngpus STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $np ./mpi_lu/plu_example_float -p $p -q $q -nblocks $nblocks -size $(($nblocks * $BLOCKSIZE)) -numa > log.out 2> log.err
+STARPU_NCPUS=0 STARPU_NCUDA=$ngpus STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $np $STARPU_LAUNCH ./mpi_lu/plu_example_float -p $p -q $q -nblocks $nblocks -size $(($nblocks * $BLOCKSIZE)) -numa > log.out 2> log.err
 cat log.out > log
 cat log.err >> log
 cat log

+ 2 - 0
mpi/examples/user_datatype/my_interface.h

@@ -59,6 +59,7 @@ static struct starpu_codelet starpu_my_data_display_codelet =
 	.cpu_funcs_name = {"starpu_my_data_display_codelet_cpu"},
 	.nbuffers = 1,
 	.modes = {STARPU_R},
+	.model = &starpu_perfmodel_nop,
 	.name = "starpu_my_data_display_codelet"
 };
 
@@ -68,6 +69,7 @@ static struct starpu_codelet starpu_my_data_compare_codelet =
 	.cpu_funcs_name = {"starpu_my_data_compare_codelet_cpu"},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_R},
+	.model = &starpu_perfmodel_nop,
 	.name = "starpu_my_data_compare_codelet"
 };
 

+ 70 - 34
mpi/src/mpi/starpu_mpi_mpi.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2019                                CNRS
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2012,2013,2016,2017                      Inria
  * Copyright (C) 2017                                     Guillaume Beauchamp
  *
@@ -19,6 +19,10 @@
 
 #include <stdlib.h>
 #include <limits.h>
+#include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <starpu_mpi.h>
 #include <starpu_mpi_datatype.h>
 #include <starpu_mpi_private.h>
@@ -33,7 +37,6 @@
 #include <mpi/starpu_mpi_tag.h>
 #include <mpi/starpu_mpi_comm.h>
 #include <starpu_mpi_init.h>
-#include <common/config.h>
 #include <common/thread.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/coherency.h>
@@ -325,7 +328,7 @@ static void _starpu_mpi_simgrid_wait_req_func(void* arg)
 	STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_error_code(ret));
 
 	*(sim_req->done) = 1;
-	starpu_pthread_queue_signal(sim_req->queue);
+	starpu_pthread_queue_broadcast(sim_req->queue);
 
 	free(sim_req);
 
@@ -501,10 +504,10 @@ void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 	{
 		_STARPU_MPI_COMM_FROM_DEBUG(req, req->count, req->datatype, req->node_tag.node.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.node.comm);
 		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.node.rank, _STARPU_MPI_TAG_DATA, req->node_tag.node.comm, &req->backend->data_request);
+	}
 #ifdef STARPU_SIMGRID
-		_starpu_mpi_simgrid_wait_req(&req->backend->data_request, &req->status_store, &req->queue, &req->done);
+	_starpu_mpi_simgrid_wait_req(&req->backend->data_request, &req->status_store, &req->queue, &req->done);
 #endif
-	}
 	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
 	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag);
@@ -526,6 +529,7 @@ void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 /*                                                      */
 /********************************************************/
 
+#ifndef STARPU_SIMGRID
 void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 {
 	_STARPU_MPI_LOG_IN();
@@ -535,10 +539,6 @@ void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 	if (req->backend->data_request != MPI_REQUEST_NULL)
 	{
-		// TODO: Fix for STARPU_SIMGRID
-#ifdef STARPU_SIMGRID
-		STARPU_MPI_ASSERT_MSG(0, "Implement this in STARPU_SIMGRID");
-#endif
 		req->ret = MPI_Wait(&req->backend->data_request, waiting_req->status);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
@@ -548,15 +548,36 @@ void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 
 	_STARPU_MPI_LOG_OUT();
 }
+#endif
 
 int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 {
 	int ret;
 	struct _starpu_mpi_req *req = *public_req;
-	struct _starpu_mpi_req *waiting_req;
 
 	_STARPU_MPI_LOG_IN();
 
+#ifdef STARPU_SIMGRID
+	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
+	starpu_pthread_wait_t wait;
+	starpu_pthread_wait_init(&wait);
+	starpu_pthread_queue_register(&wait, &req->queue);
+	while (1)
+	{
+		starpu_pthread_wait_reset(&wait);
+		if (req->done)
+			break;
+		starpu_pthread_wait_wait(&wait);
+	}
+	starpu_pthread_queue_unregister(&wait, &req->queue);
+	starpu_pthread_wait_destroy(&wait);
+	_STARPU_MPI_TRACE_UWAIT_END(req->node_tag.node.rank, req->node_tag.data_tag);
+
+	if (status)
+		*status = req->status_store;
+	_starpu_mpi_handle_request_termination(req);
+#else
+	struct _starpu_mpi_req *waiting_req;
 	/* We cannot try to complete a MPI request that was not actually posted
 	 * to MPI yet. */
 	STARPU_PTHREAD_MUTEX_LOCK(&(req->backend->req_mutex));
@@ -580,16 +601,17 @@ int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 		STARPU_PTHREAD_COND_WAIT(&req->backend->req_cond, &req->backend->req_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->req_mutex);
 
-	ret = req->ret;
-
 	/* The internal request structure was automatically allocated */
+	_starpu_mpi_request_destroy(waiting_req);
+#endif
+
 	*public_req = NULL;
 	if (req->backend->internal_req)
 	{
 		_starpu_mpi_request_destroy(req->backend->internal_req);
 	}
+	ret = req->ret;
 	_starpu_mpi_request_destroy(req);
-	_starpu_mpi_request_destroy(waiting_req);
 
 	_STARPU_MPI_LOG_OUT();
 	return ret;
@@ -601,6 +623,7 @@ int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 /*                                                      */
 /********************************************************/
 
+#ifndef STARPU_SIMGRID
 void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 {
 	_STARPU_MPI_LOG_IN();
@@ -613,12 +636,7 @@ void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 
 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 
-#ifdef STARPU_SIMGRID
-	req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, testing_req->flag);
-	memcpy(testing_req->status, &req->status_store, sizeof(*testing_req->status));
-#else
 	req->ret = MPI_Test(&req->backend->data_request, testing_req->flag, testing_req->status);
-#endif
 
 	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
@@ -636,6 +654,7 @@ void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&testing_req->backend->req_mutex);
 	_STARPU_MPI_LOG_OUT();
 }
+#endif
 
 int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 {
@@ -648,6 +667,15 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
 
+#ifdef STARPU_SIMGRID
+	ret = req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, flag);
+	if (*flag)
+	{
+		if (status)
+			*status = req->status_store;
+		_starpu_mpi_handle_request_termination(req);
+	}
+#else
 	STARPU_PTHREAD_MUTEX_LOCK(&req->backend->req_mutex);
 	unsigned submitted = req->submitted;
 	STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->req_mutex);
@@ -676,25 +704,26 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 		ret = testing_req->ret;
 
-		if (*(testing_req->flag))
-		{
-			/* The request was completed so we free the internal
-			 * request structure which was automatically allocated
-			 * */
-			*public_req = NULL;
-			if (req->backend->internal_req)
-			{
-				_starpu_mpi_request_destroy(req->backend->internal_req);
-			}
-			_starpu_mpi_request_destroy(req);
-		}
-
 		_starpu_mpi_request_destroy(testing_req);
 	}
 	else
 	{
 		*flag = 0;
 	}
+#endif
+
+	if (*flag)
+	{
+		/* The request was completed so we free the internal
+		 * request structure which was automatically allocated
+		 * */
+		*public_req = NULL;
+		if (req->backend->internal_req)
+		{
+			_starpu_mpi_request_destroy(req->backend->internal_req);
+		}
+		_starpu_mpi_request_destroy(req);
+	}
 
 	_STARPU_MPI_LOG_OUT();
 	return ret;
@@ -930,6 +959,9 @@ static void _starpu_mpi_early_data_cb(void* arg)
 			args->req->submitted = 1;
 			STARPU_PTHREAD_COND_BROADCAST(&args->req->backend->req_cond);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&args->req->backend->req_mutex);
+#ifdef STARPU_SIMGRID
+			args->req->done = 1;
+#endif
 		}
 	}
 
@@ -1133,7 +1165,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
 	{
-		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n");
+		char hostname[65];
+		gethostname(hostname, sizeof(hostname));
+		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 	}
 	_starpu_mpi_do_initialize(argc_argv);
 	if (_starpu_mpi_thread_cpuid >= 0)
@@ -1150,13 +1184,15 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	int i;
 	for (i = 0; i < *(argc_argv->argc); i++)
 		argv_cpy[i] = strdup((*(argc_argv->argv))[i]);
+	void **tsd;
+	_STARPU_CALLOC(tsd, MAX_TSD + 1, sizeof(void*));
 #ifdef HAVE_SG_ACTOR_DATA
 	_starpu_simgrid_actor_create("main", smpi_simulated_main_, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
+	/* And set TSD for us */
+	sg_actor_data_set(sg_actor_self(), tsd);
 #else
 	MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
 	/* And set TSD for us */
-	void **tsd;
-	_STARPU_CALLOC(tsd, MAX_TSD + 1, sizeof(void*));
 	if (!smpi_process_set_user_data)
 	{
 		_STARPU_ERROR("Your version of simgrid does not provide smpi_process_set_user_data, we can not continue without it\n");

+ 8 - 8
mpi/src/mpi/starpu_mpi_mpi_backend.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2017                                     Inria
  * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
- * Copyright (C) 2009-2014,2017,2018-2019                 Université de Bordeaux
+ * Copyright (C) 2009-2014,2017,2018-2020                 Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -51,21 +51,21 @@ void _starpu_mpi_mpi_backend_request_init(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_CALLOC(req->backend, 1, sizeof(struct _starpu_mpi_req_backend));
 
-	req->backend->data_request = 0;
+	//req->backend->data_request = 0;
 
 	STARPU_PTHREAD_MUTEX_INIT(&req->backend->req_mutex, NULL);
 	STARPU_PTHREAD_COND_INIT(&req->backend->req_cond, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&req->backend->posted_mutex, NULL);
 	STARPU_PTHREAD_COND_INIT(&req->backend->posted_cond, NULL);
 
-	req->backend->other_request = NULL;
+	//req->backend->other_request = NULL;
 
-	req->backend->size_req = 0;
-	req->backend->internal_req = NULL;
-	req->backend->is_internal_req = 0;
+	//req->backend->size_req = 0;
+	//req->backend->internal_req = NULL;
+	//req->backend->is_internal_req = 0;
 	req->backend->to_destroy = 1;
-	req->backend->early_data_handle = NULL;
-	req->backend->envelope = NULL;
+	//req->backend->early_data_handle = NULL;
+	//req->backend->envelope = NULL;
 }
 
 void _starpu_mpi_mpi_backend_request_fill(struct _starpu_mpi_req *req, MPI_Comm comm, int is_internal_req)

+ 8 - 3
mpi/src/nmad/starpu_mpi_nmad.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2017                                     Inria
  * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
- * Copyright (C) 2009-2014,2017,2018-2019                 Université de Bordeaux
+ * Copyright (C) 2009-2014,2017,2018-2020                 Université de Bordeaux
  * Copyright (C) 2017                                     Guillaume Beauchamp
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,6 +19,10 @@
 
 #include <stdlib.h>
 #include <limits.h>
+#include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <starpu_mpi.h>
 #include <starpu_mpi_datatype.h>
 #include <starpu_mpi_private.h>
@@ -28,7 +32,6 @@
 #include <starpu_mpi_cache.h>
 #include <starpu_mpi_select_node.h>
 #include <starpu_mpi_init.h>
-#include <common/config.h>
 #include <common/thread.h>
 #include <datawizard/coherency.h>
 #include <core/task.h>
@@ -414,7 +417,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
 	{
-		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n");
+		char hostname[65];
+		gethostname(hostname, sizeof(hostname));
+		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 	}
 	_starpu_mpi_do_initialize(argc_argv);
 	if (_starpu_mpi_thread_cpuid >= 0)

+ 4 - 4
mpi/src/starpu_mpi.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2012,2013,2016,2017                      Inria
  * Copyright (C) 2010-2019                                CNRS
- * Copyright (C) 2009-2018                                Université de Bordeaux
+ * Copyright (C) 2009-2018,2020                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -46,7 +46,7 @@ static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum sta
 
 static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg, int sequential_consistency)
 {
-	if (_starpu_mpi_fake_world_size != -1)
+	if (STARPU_UNLIKELY(_starpu_mpi_fake_world_size != -1))
 	{
 		/* Don't actually do the communication */
 		return NULL;
@@ -114,9 +114,9 @@ int starpu_mpi_send_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_
 	MPI_Status status;
 
 	_STARPU_MPI_LOG_IN();
-	memset(&status, 0, sizeof(MPI_Status));
-
 	starpu_mpi_isend_prio(data_handle, &req, dest, data_tag, prio, comm);
+
+	memset(&status, 0, sizeof(MPI_Status));
 	starpu_mpi_wait(&req, &status);
 
 	_STARPU_MPI_LOG_OUT();

+ 17 - 17
mpi/src/starpu_mpi_req.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2019                                CNRS
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2012,2013,2016,2017                      Inria
  * Copyright (C) 2017                                     Guillaume Beauchamp
  *
@@ -25,37 +25,37 @@ void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	_STARPU_MPI_CALLOC(*req, 1, sizeof(struct _starpu_mpi_req));
 
 	/* Initialize the request structure */
-	(*req)->data_handle = NULL;
-	(*req)->prio = 0;
+	//(*req)->data_handle = NULL;
+	//(*req)->prio = 0;
 
-	(*req)->datatype = 0;
-	(*req)->datatype_name = NULL;
-	(*req)->ptr = NULL;
+	//(*req)->datatype = 0;
+	//(*req)->datatype_name = NULL;
+	//(*req)->ptr = NULL;
 	(*req)->count = -1;
 	(*req)->registered_datatype = -1;
 
 	(*req)->node_tag.node.rank = -1;
 	(*req)->node_tag.data_tag = -1;
-	(*req)->node_tag.node.comm = 0;
+	//(*req)->node_tag.node.comm = 0;
 
-	(*req)->func = NULL;
+	//(*req)->func = NULL;
 
-	(*req)->status = NULL;
-	(*req)->flag = NULL;
+	//(*req)->status = NULL;
+	//(*req)->flag = NULL;
 	_starpu_mpi_req_multilist_init_coop_sends(*req);
 
 	(*req)->ret = -1;
 
 	(*req)->request_type = UNKNOWN_REQ;
 
-	(*req)->submitted = 0;
-	(*req)->completed = 0;
-	(*req)->posted = 0;
+	//(*req)->submitted = 0;
+	//(*req)->completed = 0;
+	//(*req)->posted = 0;
 
-	(*req)->sync = 0;
+	//(*req)->sync = 0;
 	(*req)->detached = -1;
-	(*req)->callback = NULL;
-	(*req)->callback_arg = NULL;
+	//(*req)->callback = NULL;
+	//(*req)->callback_arg = NULL;
 
 	(*req)->sequential_consistency = 1;
 	(*req)->pre_sync_jobid = -1;
@@ -64,7 +64,7 @@ void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 #ifdef STARPU_SIMGRID
 	starpu_pthread_queue_init(&((*req)->queue));
 	starpu_pthread_queue_register(&_starpu_mpi_thread_wait, &((*req)->queue));
-	(*req)->done = 0;
+	//(*req)->done = 0;
 #endif
 	_mpi_backend._starpu_mpi_backend_request_init(*req);
 }

+ 23 - 23
mpi/tests/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2010-2019                                CNRS
-# Copyright (C) 2009-2018                                Université de Bordeaux
+# Copyright (C) 2009-2018,2020                           Université de Bordeaux
 # Copyright (C) 2013                                     Thibaut Lambert
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -95,12 +95,17 @@ if BUILD_TESTS
 starpu_mpi_TESTS =
 
 starpu_mpi_TESTS +=				\
+	broadcast				\
 	cache					\
 	cache_disable				\
 	callback				\
+	driver					\
 	early_request				\
+	gather					\
+	gather2					\
 	insert_task				\
 	insert_task_block			\
+	insert_task_count			\
 	insert_task_dyn_handles			\
 	insert_task_node_choice			\
 	insert_task_owner			\
@@ -108,52 +113,47 @@ starpu_mpi_TESTS +=				\
 	insert_task_owner_data			\
 	matrix					\
 	matrix2					\
+	mpi_barrier				\
 	mpi_detached_tag			\
+	mpi_earlyrecv				\
+	mpi_irecv				\
 	mpi_irecv_detached			\
+	mpi_isend				\
 	mpi_isend_detached			\
 	mpi_reduction				\
+	mpi_redux				\
 	mpi_scatter_gather			\
+	mpi_test				\
+	multiple_send				\
+	pingpong				\
 	policy_register				\
 	policy_register_many			\
 	policy_selection			\
 	policy_selection2			\
+	ring					\
+	ring_async				\
 	ring_async_implicit			\
+	ring_sync				\
+	ring_sync_detached			\
 	temporary				\
-	early_stuff
+	user_defined_datatype			\
+	early_stuff				\
+	sendrecv_bench
 
 if !STARPU_SIMGRID
+# missing support in simgrid
 starpu_mpi_TESTS +=				\
 	attr					\
-	broadcast				\
-	pingpong				\
-	mpi_test				\
-	mpi_isend				\
-	mpi_earlyrecv				\
 	mpi_earlyrecv2				\
 	mpi_earlyrecv2_sync			\
-	mpi_irecv				\
-	mpi_barrier				\
-	mpi_redux				\
-	ring					\
-	ring_sync				\
-	ring_sync_detached			\
-	ring_async				\
 	block_interface				\
 	block_interface_pinned			\
-	matrix2					\
 	insert_task_compute			\
 	insert_task_sent_cache			\
 	insert_task_recv_cache			\
-	insert_task_count			\
 	insert_task_seq				\
-	multiple_send				\
-	user_defined_datatype			\
 	tags_checking				\
-	sync					\
-	gather					\
-	gather2					\
-	driver					\
-	sendrecv_bench
+	sync
 
 if STARPU_USE_MPI_MPI
 starpu_mpi_TESTS +=				\

+ 4 - 1
mpi/tests/driver.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2017,2018                                CNRS
- * Copyright (C) 2018                                     Université de Bordeaux
+ * Copyright (C) 2018,2020                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -100,6 +100,9 @@ int main(int argc, char **argv)
 			}
 		}
 		finished = request[0] == NULL && request[1] == NULL;
+#ifdef STARPU_SIMGRID
+		starpu_sleep(0.001);
+#endif
 	}
 
 	if (rank%2 == 0)

+ 4 - 1
mpi/tests/mpi_earlyrecv.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2015,2017                           CNRS
- * Copyright (C) 2009,2010,2014,2015,2017,2018            Université de Bordeaux
+ * Copyright (C) 2009,2010,2014,2015,2017,2018,2020       Université de Bordeaux
  * Copyright (C) 2013                                     Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -98,6 +98,9 @@ int main(int argc, char **argv)
 			}
 		}
 		finished = request[0] == NULL && request[1] == NULL;
+#ifdef STARPU_SIMGRID
+		starpu_sleep(0.001);
+#endif
 	}
 
 	if (rank%2 == 0)

+ 2 - 2
mpi/tests/mpi_earlyrecv2.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2017                                CNRS
- * Copyright (C) 2009,2010,2014,2015,2017,2018            Université de Bordeaux
+ * Copyright (C) 2009,2010,2014,2015,2017,2018,2020       Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Inria
  *
@@ -73,7 +73,7 @@ int exchange(int rank, starpu_data_handle_t *handles, check_func func, int detac
 		}
 
 		// We sleep to make sure that the data for the tag 9 will be received before the recv is posted
-		usleep(2000000);
+		starpu_sleep(2);
 		for(i=1 ; i<NB ; i++)
 		{
 			if (detached)

+ 2 - 2
mpi/tests/mpi_earlyrecv2_sync.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2017                                CNRS
- * Copyright (C) 2009,2010,2015,2018                      Université de Bordeaux
+ * Copyright (C) 2009,2010,2015,2018,2020                 Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -65,7 +65,7 @@ int exchange(int rank, starpu_data_handle_t *handles, check_func func)
 		STARPU_ASSERT(req[1] != NULL);
 
 		// We sleep to make sure that the data for the tag 8 and the tag 9 will be received before the recvs are posted
-		usleep(2000000);
+		starpu_sleep(2);
 		for(i=2 ; i<NB ; i++)
 		{
 			starpu_mpi_irecv(handles[i], &req[i], other_rank, i, MPI_COMM_WORLD);

+ 4 - 1
mpi/tests/mpi_test.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010,2011,2014,2015,2017,2018            Université de Bordeaux
+ * Copyright (C) 2010,2011,2014,2015,2017,2018,2020       Université de Bordeaux
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  *
@@ -79,6 +79,9 @@ int main(int argc, char **argv)
 		{
 			MPI_Status status;
 			starpu_mpi_test(&req, &finished, &status);
+#ifdef STARPU_SIMGRID
+			starpu_sleep(0.001);
+#endif
 		}
 		while (!finished);
 	}

+ 4 - 1
mpi/tests/multiple_send.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2011-2013,2015,2017                      CNRS
- * Copyright (C) 2011,2015,2018                           Université de Bordeaux
+ * Copyright (C) 2011,2015,2018,2020                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -80,6 +80,9 @@ int main(int argc, char **argv)
 					}
 				}
 			}
+#ifdef STARPU_SIMGRID
+			starpu_sleep(0.001);
+#endif
 		}
 	}
 	FPRINTF(stderr, "[%d] All requests finished\n", rank);

+ 4 - 4
mpi/tests/pingpong.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011,2014,2015,2017,2018            Université de Bordeaux
+ * Copyright (C) 2009-2011,2014,2015,2017,2018,2020       Université de Bordeaux
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  *
@@ -153,7 +153,7 @@ int main(int argc, char **argv)
 				starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
 			}
 
-			usleep(sleep_time * 1000);
+			starpu_sleep(sleep_time / 1000);
 		}
 	}
 	else // broadcasts
@@ -168,7 +168,7 @@ int main(int argc, char **argv)
 					if (r != rank)
 					{
 						starpu_mpi_send(tab_handle, r, (r * niter) + loop, MPI_COMM_WORLD);
-						usleep(sleep_time * 1000);
+						starpu_sleep(sleep_time / 1000);
 					}
 				}
 			}
@@ -178,7 +178,7 @@ int main(int argc, char **argv)
 				starpu_mpi_recv(tab_handle, sender, (rank * niter) + loop, MPI_COMM_WORLD, &status);
 
 				for (int r = 0; r < (size-1); r++)
-					usleep(sleep_time * 1000);
+					starpu_sleep(sleep_time / 1000);
 			}
 		}
 	}

+ 10 - 1
mpi/tests/sendrecv_bench.c

@@ -95,7 +95,16 @@ int main(int argc, char **argv)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
 
-	STARPU_ASSERT_MSG(worldsize == 2, "We need two prcesses.");
+	if (worldsize != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
 
 
 	if (rank == 0)

+ 2 - 1
src/common/fxt.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2013,2015                           Inria
- * Copyright (C) 2008-2019                                Université de Bordeaux
+ * Copyright (C) 2008-2020                                Université de Bordeaux
  * Copyright (C) 2010-2018                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,6 +19,7 @@
 #include <starpu.h>
 #include <common/config.h>
 #include <common/utils.h>
+#include <core/simgrid.h>
 #include <starpu_util.h>
 #include <starpu_profiling.h>
 

+ 2 - 2
src/common/graph.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2016,2017                                CNRS
  * Copyright (C) 2017                                     Inria
- * Copyright (C) 2016-2018                                Université de Bordeaux
+ * Copyright (C) 2016-2018,2020                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -140,7 +140,7 @@ void _starpu_graph_add_job(struct _starpu_job *job)
 	_STARPU_CALLOC(node, 1, sizeof(*node));
 	node->job = job;
 	job->graph_node = node;
-	STARPU_PTHREAD_MUTEX_INIT(&node->mutex, NULL);
+	STARPU_PTHREAD_MUTEX_INIT0(&node->mutex, NULL);
 
 	_starpu_graph_wrlock();
 

+ 7 - 2
src/common/list.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2018                                Université de Bordeaux
+ * Copyright (C) 2008-2018,2020                           Université de Bordeaux
  * Copyright (C) 2010-2012,2015-2018                      CNRS
  * Copyright (C) 2017                                     Inria
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -51,6 +51,9 @@
  *   * Initializes a list (initially empty)
  *   void		FOO_list_init(struct FOO_list*);
  *
+ *   * Initializes a list (initially empty), assuming that the content of FOO_list was already zeroed
+ *   void		FOO_list_init0(struct FOO_list*);
+ *
  *   * Deletes a list
  *   void		FOO_list_delete(struct FOO_list*);
  *
@@ -225,7 +228,9 @@
   /** @internal */LIST_INLINE struct ENAME *ENAME##_list_back(const struct ENAME##_list *l) \
     { return l->_tail; } \
   /** @internal */LIST_INLINE void ENAME##_list_init(struct ENAME##_list *l) \
-    { l->_head=NULL; l->_tail=l->_head; } \
+    { l->_head=NULL; l->_tail=NULL; } \
+  /** @internal */LIST_INLINE void ENAME##_list_init0(struct ENAME##_list *l STARPU_ATTRIBUTE_UNUSED) \
+    { } \
   /** @internal */LIST_INLINE struct ENAME##_list *ENAME##_list_new(void) \
     { struct ENAME##_list *l; _STARPU_MALLOC(l, sizeof(struct ENAME##_list)); \
       ENAME##_list_init(l); return l; } \

+ 14 - 4
src/common/prio_list.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2017,2018                                Inria
  * Copyright (C) 2016,2017                                CNRS
- * Copyright (C) 2015-2017,2019                           Université de Bordeaux
+ * Copyright (C) 2015-2017,2019-2020                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -37,6 +37,9 @@
  * * Initialize a new priority list
  * void FOO_prio_list_init(struct FOO_prio_list*)
  *
+ * * Initialize a new priority list, assuming that the content of FOO_prio_list was already zeroed
+ * void FOO_prio_list_init0(struct FOO_prio_list*)
+ *
  * * Free an empty priority list
  * void FOO_prio_list_deinit(struct FOO_prio_list*)
  *
@@ -152,6 +155,11 @@
 		starpu_rbtree_init(&priolist->tree); \
 		priolist->empty = 1; \
 	} \
+	PRIO_LIST_INLINE void ENAME##_prio_list_init0(struct ENAME##_prio_list *priolist) \
+	{ \
+		starpu_rbtree_init0(&priolist->tree); \
+		priolist->empty = 1; \
+	} \
 	PRIO_LIST_INLINE void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
 	{ \
 		if (starpu_rbtree_empty(&priolist->tree)) \
@@ -183,10 +191,10 @@
 		if (node) \
 			stage = ENAME##_node_to_list_stage(node); \
 		else { \
-			_STARPU_MALLOC(stage, sizeof(*stage));	\
-			starpu_rbtree_node_init(&stage->node); \
+			_STARPU_CALLOC(stage, 1, sizeof(*stage));	\
+			starpu_rbtree_node_init0(&stage->node); \
 			stage->prio = prio; \
-			ENAME##_list_init(&stage->list); \
+			ENAME##_list_init0(&stage->list); \
 			starpu_rbtree_insert_slot(&priolist->tree, slot, &stage->node); \
 		} \
 		return stage; \
@@ -469,6 +477,8 @@
 	struct ENAME##_prio_list { struct ENAME##_list list; }; \
 	PRIO_LIST_INLINE void ENAME##_prio_list_init(struct ENAME##_prio_list *priolist) \
 	{ ENAME##_list_init(&(priolist)->list); } \
+	PRIO_LIST_INLINE void ENAME##_prio_list_init0(struct ENAME##_prio_list *priolist) \
+	{ ENAME##_list_init0(&(priolist)->list); } \
 	PRIO_LIST_INLINE void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
 	{ (void) (priolist); /* ENAME##_list_deinit(&(priolist)->list); */ } \
 	PRIO_LIST_INLINE void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \

+ 21 - 0
src/common/rbtree.h

@@ -34,6 +34,8 @@
 #include <stdint.h>
 #include <sys/types.h>
 
+#include <starpu_util.h>
+
 #define MACRO_BEGIN ({
 #define MACRO_END })
 /*
@@ -68,6 +70,13 @@ static inline void starpu_rbtree_init(struct starpu_rbtree *tree)
 }
 
 /*
+ * This version assumes that the content of tree was already zeroed
+ */
+static inline void starpu_rbtree_init0(struct starpu_rbtree *tree STARPU_ATTRIBUTE_UNUSED)
+{
+}
+
+/*
  * Initialize a node.
  *
  * A node is in no tree when its parent points to itself.
@@ -82,6 +91,18 @@ static inline void starpu_rbtree_node_init(struct starpu_rbtree_node *node)
 }
 
 /*
+ * This version assumes that the content of node was already zeroed
+ */
+static inline void starpu_rbtree_node_init0(struct starpu_rbtree_node *node)
+{
+    assert(starpu_rbtree_check_alignment(node));
+
+    node->parent = (uintptr_t)node | STARPU_RBTREE_COLOR_RED;
+    //node->children[STARPU_RBTREE_LEFT] = NULL;
+    //node->children[STARPU_RBTREE_RIGHT] = NULL;
+}
+
+/*
  * Return true if node is in no tree.
  */
 static inline int starpu_rbtree_node_unlinked(const struct starpu_rbtree_node *node)

+ 6 - 3
src/common/thread.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2013,2015,2017                           Inria
  * Copyright (C) 2010-2017                                CNRS
- * Copyright (C) 2010,2012-2019                           Université de Bordeaux
+ * Copyright (C) 2010,2012-2020                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,7 +29,10 @@
 #include <limits.h>
 
 #ifdef STARPU_SIMGRID
-#ifdef STARPU_HAVE_XBT_SYNCHRO_H
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+#include <simgrid/mutex.h>
+#include <simgrid/cond.h>
+#elif defined(STARPU_HAVE_XBT_SYNCHRO_H)
 #include <xbt/synchro.h>
 #else
 #include <xbt/synchro_core.h>
@@ -72,7 +75,7 @@ starpu_pthread_t starpu_pthread_self(void)
 #endif
 }
 
-int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED, void *(*start_routine) (void *), void *arg, starpu_sg_host_t host)
+int starpu_pthread_create_on(const char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED, void *(*start_routine) (void *), void *arg, starpu_sg_host_t host)
 {
 	char **_args;
 	_STARPU_MALLOC(_args, 3*sizeof(char*));

+ 10 - 10
src/core/dependencies/cg.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012                                     Inria
- * Copyright (C) 2010-2012,2014-2018                      Université de Bordeaux
+ * Copyright (C) 2010-2012,2014-2018,2020                 Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2018                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,23 +24,23 @@
 #include <core/dependencies/cg.h>
 #include <core/dependencies/tags.h>
 
-void _starpu_cg_list_init(struct _starpu_cg_list *list)
+void _starpu_cg_list_init0(struct _starpu_cg_list *list)
 {
 	_starpu_spin_init(&list->lock);
-	list->ndeps = 0;
-	list->ndeps_completed = 0;
+	//list->ndeps = 0;
+	//list->ndeps_completed = 0;
 #ifdef STARPU_DEBUG
-	list->deps = NULL;
-	list->done = NULL;
+	//list->deps = NULL;
+	//list->done = NULL;
 #endif
 
-	list->terminated = 0;
+	//list->terminated = 0;
 
-	list->nsuccs = 0;
+	//list->nsuccs = 0;
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 	/* this is a small initial default value ... may be changed */
-	list->succ_list_size = 0;
-	list->succ = NULL;
+	//list->succ_list_size = 0;
+	//list->succ = NULL;
 #endif
 }
 

+ 2 - 2
src/core/dependencies/cg.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2018                                Université de Bordeaux
+ * Copyright (C) 2010-2018,2020                           Université de Bordeaux
  * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -114,7 +114,7 @@ void _starpu_notify_dependencies(struct _starpu_job *j);
 void _starpu_job_notify_start(struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch);
 void _starpu_job_notify_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data);
 
-void _starpu_cg_list_init(struct _starpu_cg_list *list);
+void _starpu_cg_list_init0(struct _starpu_cg_list *list);
 void _starpu_cg_list_deinit(struct _starpu_cg_list *list);
 int _starpu_add_successor_to_cg_list(struct _starpu_cg_list *successors, struct _starpu_cg *cg);
 int _starpu_list_task_successors_in_cg_list(struct _starpu_cg_list *successors, unsigned ndeps, struct starpu_task *task_array[]);

+ 44 - 16
src/core/dependencies/implicit_data_deps.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2016                           Inria
- * Copyright (C) 2010-2019                                Université de Bordeaux
+ * Copyright (C) 2010-2020                                Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2018                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -49,8 +49,8 @@ static void _starpu_add_dependency(starpu_data_handle_t handle, struct starpu_ta
 	_starpu_add_ghost_dependency(handle, _starpu_get_job_associated_to_task(previous)->job_id, next);
 }
 
-/* Add pre_sync_task as new accessor among the existing ones, making it depend on the last synchronization task if any.  */
-static void _starpu_add_accessor(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot)
+/* Add post_sync_task as new accessor among the existing ones, making pre_sync_task depend on the last synchronization task if any.  */
+static void _starpu_add_accessor(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, int *submit_pre_sync, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot)
 {
 	/* Add this task to the list of readers */
 	STARPU_ASSERT(!post_sync_task_dependency_slot->prev);
@@ -64,6 +64,7 @@ static void _starpu_add_accessor(starpu_data_handle_t handle, struct starpu_task
 	/* This task depends on the previous synchronization task if any */
 	if (handle->last_sync_task && handle->last_sync_task != post_sync_task)
 	{
+		*submit_pre_sync= 1;
 		struct starpu_task *task_array[1] = {handle->last_sync_task};
 		_starpu_task_declare_deps_array(pre_sync_task, 1, task_array, 0);
 		_starpu_add_dependency(handle, handle->last_sync_task, pre_sync_task);
@@ -93,7 +94,7 @@ static void _starpu_add_accessor(starpu_data_handle_t handle, struct starpu_task
 		_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", handle->last_submitted_ghost_sync_id, pre_sync_task);
 	}
 
-	if (!pre_sync_task->cl)
+	if (*submit_pre_sync && !pre_sync_task->cl)
 	{
 		/* Add a reference to be released in _starpu_handle_job_termination */
 		_starpu_spin_lock(&handle->header_lock);
@@ -202,7 +203,14 @@ static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_tas
  * */
 /* NB : handle->sequential_consistency_mutex must be held by the caller;
  * returns a task, to be submitted after releasing that mutex. */
-struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot,
+/* *submit_pre_sync tells whether the pre_sync_task will be submitted or not. The
+ * caller should set it to 1 if it intends to submit it anyway, or to 0
+ * if it may not submit it (because it has no other use for the task than
+ * synchronization). In the latter case,
+ * _starpu_detect_implicit_data_deps_with_handle will set it to 1 in case the
+ * task really needs to be submitted, or leave it to 0 if there is nothing to be
+ * waited for anyway. */
+struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, int *submit_pre_sync, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot,
 								  starpu_data_handle_t handle, enum starpu_data_access_mode mode, unsigned task_handle_sequential_consistency)
 {
 	struct starpu_task *task = NULL;
@@ -228,8 +236,14 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
 		/* Skip tasks that are associated to a reduction phase so that
 		 * they do not interfere with the application. */
-		if (pre_sync_job->reduction_task || post_sync_job->reduction_task)
+		if (pre_sync_job->reduction_task) {
+			*submit_pre_sync = 1;
 			return NULL;
+		}
+		if (post_sync_job->reduction_task) {
+			*submit_pre_sync = 0;
+			return NULL;
+		}
 
 		/* In case we are generating the DAG, we add an implicit
 		 * dependency between the pre and the post sync tasks in case
@@ -264,7 +278,9 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 		{
 			_STARPU_DEP_DEBUG("concurrently\n");
 			/* Can access concurrently with current tasks */
-			_starpu_add_accessor(handle, pre_sync_task, post_sync_task, post_sync_task_dependency_slot);
+			if (handle->last_sync_task != NULL)
+				*submit_pre_sync = 1;
+			_starpu_add_accessor(handle, pre_sync_task, submit_pre_sync, post_sync_task, post_sync_task_dependency_slot);
 		}
 		else
 		{
@@ -277,6 +293,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 					|| (l != &handle->last_submitted_accessors && handle->last_submitted_ghost_accessors_id))
 			{
 				/* Several previous accessors */
+				*submit_pre_sync = 1;
 
 				if (mode == STARPU_W)
 				{
@@ -308,7 +325,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 					/* Make this task wait for the previous ones */
 					_starpu_add_sync_task(handle, sync_task, sync_task, post_sync_task);
 					/* And the requested task wait for this one */
-					_starpu_add_accessor(handle, pre_sync_task, post_sync_task, post_sync_task_dependency_slot);
+					_starpu_add_accessor(handle, pre_sync_task, submit_pre_sync, post_sync_task, post_sync_task_dependency_slot);
 
 					task = sync_task;
 				}
@@ -321,6 +338,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 				{
 					/* One accessor, make it the sync task,
 					 * and start depending on it. */
+					*submit_pre_sync = 1;
 					_STARPU_DEP_DEBUG("One previous accessor, depending on it\n");
 					handle->last_sync_task = l->task;
 					l->next = NULL;
@@ -343,10 +361,12 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 				{
 					_STARPU_DEP_DEBUG("No previous accessor, no dependency\n");
 				}
-				_starpu_add_accessor(handle, pre_sync_task, post_sync_task, post_sync_task_dependency_slot);
+				_starpu_add_accessor(handle, pre_sync_task, submit_pre_sync, post_sync_task, post_sync_task_dependency_slot);
 			}
 		}
 		handle->last_submitted_mode = mode;
+	} else {
+		*submit_pre_sync = 0;
 	}
         _STARPU_LOG_OUT();
 	return task;
@@ -423,9 +443,10 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 		STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 		unsigned index = descrs[buffer].index;
 		unsigned task_handle_sequential_consistency = task->handles_sequential_consistency ? task->handles_sequential_consistency[index] : handle->sequential_consistency;
+		int submit_pre_sync = 1;
 		if (!task_handle_sequential_consistency)
 			j->sequential_consistency = 0;
-		new_task = _starpu_detect_implicit_data_deps_with_handle(task, task, &dep_slots[buffer], handle, mode, task_handle_sequential_consistency);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(task, &submit_pre_sync, task, &dep_slots[buffer], handle, mode, task_handle_sequential_consistency);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 		if (new_task)
 		{
@@ -631,6 +652,7 @@ int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_d
 	if (sequential_consistency)
 	{
 		struct starpu_task *sync_task, *new_task;
+		int submit_pre_sync = 0;
 		sync_task = starpu_task_create();
 		sync_task->name = sync_name;
 		sync_task->detach = 0;
@@ -639,7 +661,7 @@ int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_d
 
 		/* It is not really a RW access, but we want to make sure that
 		 * all previous accesses are done */
-		new_task = _starpu_detect_implicit_data_deps_with_handle(sync_task, sync_task, &_starpu_get_job_associated_to_task(sync_task)->implicit_dep_slot, handle, mode, sequential_consistency);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(sync_task, &submit_pre_sync, sync_task, &_starpu_get_job_associated_to_task(sync_task)->implicit_dep_slot, handle, mode, sequential_consistency);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 		if (new_task)
@@ -648,11 +670,17 @@ int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_d
 			STARPU_ASSERT(!ret);
 		}
 
-		/* TODO detect if this is superflous */
-		int ret = _starpu_task_submit_internally(sync_task);
-		STARPU_ASSERT(!ret);
-		ret = starpu_task_wait(sync_task);
-		STARPU_ASSERT(ret == 0);
+		if (submit_pre_sync)
+		{
+			int ret = _starpu_task_submit_internally(sync_task);
+			STARPU_ASSERT(!ret);
+			ret = starpu_task_wait(sync_task);
+			STARPU_ASSERT(ret == 0);
+		}
+		else
+		{
+			starpu_task_destroy(sync_task);
+		}
 	}
 	else
 	{

+ 2 - 2
src/core/dependencies/implicit_data_deps.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012,2014,2015,2017,2018            Université de Bordeaux
+ * Copyright (C) 2010-2012,2014,2015,2017,2018,2020       Université de Bordeaux
  * Copyright (C) 2010,2011,2013,2015,2017,2018            CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,7 +21,7 @@
 #include <starpu.h>
 #include <common/config.h>
 
-struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot,
+struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, int *submit_pre_sync, struct starpu_task *post_sync_task, struct _starpu_task_wrapper_dlist *post_sync_task_dependency_slot,
 								  starpu_data_handle_t handle, enum starpu_data_access_mode mode, unsigned task_handle_sequential_consistency);
 int _starpu_test_implicit_data_deps_with_handle(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 void _starpu_detect_implicit_data_deps(struct starpu_task *task);

+ 6 - 6
src/core/dependencies/tags.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2014,2016-2018                      Université de Bordeaux
+ * Copyright (C) 2008-2014,2016-2018,2020                 Université de Bordeaux
  * Copyright (C) 2011,2012,2016                           Inria
  * Copyright (C) 2010-2013,2015-2017,2019                 CNRS
  *
@@ -76,16 +76,16 @@ static struct _starpu_cg *create_cg_tag(unsigned ntags, struct _starpu_tag *tag)
 static struct _starpu_tag *_starpu_tag_init(starpu_tag_t id)
 {
 	struct _starpu_tag *tag;
-	_STARPU_MALLOC(tag, sizeof(struct _starpu_tag));
+	_STARPU_CALLOC(tag, 1, sizeof(struct _starpu_tag));
 
-	tag->job = NULL;
-	tag->is_assigned = 0;
-	tag->is_submitted = 0;
+	//tag->job = NULL;
+	//tag->is_assigned = 0;
+	//tag->is_submitted = 0;
 
 	tag->id = id;
 	tag->state = STARPU_INVALID_STATE;
 
-	_starpu_cg_list_init(&tag->tag_successors);
+	_starpu_cg_list_init0(&tag->tag_successors);
 
 	_starpu_spin_init(&tag->lock);
 

+ 5 - 7
src/core/jobs.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2017                                Inria
- * Copyright (C) 2008-2019                                Université de Bordeaux
+ * Copyright (C) 2008-2020                                Université de Bordeaux
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -70,11 +70,9 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 	struct _starpu_job *job;
         _STARPU_LOG_IN();
 
-	_STARPU_MALLOC(job, sizeof(*job));
-
 	/* As most of the fields must be initialized at NULL, let's put 0
 	 * everywhere */
-	memset(job, 0, sizeof(*job));
+	_STARPU_CALLOC(job, 1, sizeof(*job));
 
 	if (task->dyn_handles)
 	{
@@ -99,10 +97,10 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 			maxnjobs = jobs;
 	}
 
-	_starpu_cg_list_init(&job->job_successors);
+	_starpu_cg_list_init0(&job->job_successors);
 
-	STARPU_PTHREAD_MUTEX_INIT(&job->sync_mutex, NULL);
-	STARPU_PTHREAD_COND_INIT(&job->sync_cond, NULL);
+	STARPU_PTHREAD_MUTEX_INIT0(&job->sync_mutex, NULL);
+	STARPU_PTHREAD_COND_INIT0(&job->sync_cond, NULL);
 
 	/* By default we have sequential tasks */
 	job->task_size = 1;

+ 8 - 8
src/core/perfmodel/perfmodel_history.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2014,2016,2017                      Inria
- * Copyright (C) 2008-2019                                Université de Bordeaux
+ * Copyright (C) 2008-2020                                Université de Bordeaux
  * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -582,7 +582,7 @@ static void parse_per_arch_model_file(FILE *f, const char *path, struct starpu_p
 			 * good-enough estimation */
 			STARPU_HG_DISABLE_CHECKING(entry->nsample);
 			STARPU_HG_DISABLE_CHECKING(entry->mean);
-			entry->nerror = 0;
+			//entry->nerror = 0;
 		}
 
 		scan_history_entry(f, path, entry);
@@ -1886,18 +1886,18 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 				/* Do not take the first measurement into account, it is very often quite bogus */
 				/* TODO: it'd be good to use a better estimation heuristic, like the median, or latest n values, etc. */
-				entry->mean = 0;
-				entry->sum = 0;
+				//entry->mean = 0;
+				//entry->sum = 0;
 
-				entry->deviation = 0.0;
-				entry->sum2 = 0;
+				//entry->deviation = 0.0;
+				//entry->sum2 = 0;
 
 				entry->size = _starpu_job_get_data_size(model, arch, impl, j);
 				entry->flops = j->task->flops;
 
 				entry->footprint = key;
-				entry->nsample = 0;
-				entry->nerror = 0;
+				//entry->nsample = 0;
+				//entry->nerror = 0;
 
 				insert_history_entry(entry, list, &per_arch_model->history);
 			}

+ 36 - 9
src/core/simgrid.c

@@ -38,6 +38,12 @@
 #ifdef STARPU_HAVE_SIMGRID_HOST_H
 #include <simgrid/host.h>
 #endif
+#ifdef STARPU_HAVE_SIMGRID_ENGINE_H
+#include <simgrid/engine.h>
+#endif
+#ifdef STARPU_HAVE_XBT_CONFIG_H
+#include <xbt/config.h>
+#endif
 #include <smpi/smpi.h>
 
 #pragma weak starpu_main
@@ -137,9 +143,9 @@ int _starpu_simgrid_get_nbhosts(const char *prefix)
 		char name[32];
 		STARPU_ASSERT(starpu_mpi_world_rank);
 		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%d", starpu_mpi_world_rank());
-#if defined(HAVE_MSG_ZONE_GET_HOSTS) || defined(MSG_zone_get_hosts)
+#if defined(HAVE_MSG_ZONE_GET_HOSTS) || defined(HAVE_SG_ZONE_GET_HOSTS) || defined(MSG_zone_get_hosts) || defined(sg_zone_get_hosts)
 		hosts = xbt_dynar_new(sizeof(sg_host_t), NULL);
-#  if defined(HAVE_SG_ZONE_GET_BY_NAME) || defined(sg_zone_get_by_name)
+#  if defined(HAVE_SG_ZONE_GET_HOSTS) || defined(sg_zone_get_hosts)
 		sg_zone_get_hosts(_starpu_simgrid_get_as_by_name(name), hosts);
 #  else
 		MSG_zone_get_hosts(_starpu_simgrid_get_as_by_name(name), hosts);
@@ -280,7 +286,11 @@ void _starpu_start_simgrid(int *argc, char **argv)
 
 	simgrid_started = 1;
 
+#if defined(STARPU_SIMGRID_HAVE_SIMGRID_INIT) && defined(HAVE_SG_ACTOR_INIT)
+	simgrid_init(argc, argv);
+#else
 	MSG_init(argc, argv);
+#endif
 	/* Simgrid uses tiny stacks by default.  This comes as a surprise to our users.  */
 	unsigned stack_size = 8192;
 #ifdef HAVE_GETRLIMIT
@@ -304,7 +314,11 @@ void _starpu_start_simgrid(int *argc, char **argv)
 #else
 	_starpu_simgrid_get_platform_path(4, path, sizeof(path));
 #endif
+#if defined(STARPU_SIMGRID_HAVE_SIMGRID_INIT) && defined(HAVE_SG_ACTOR_INIT)
+	simgrid_load_platform(path);
+#else
 	MSG_create_environment(path);
+#endif
 
 	simgrid_transfer_cost = starpu_get_env_number_default("STARPU_SIMGRID_TRANSFER_COST", 1);
 }
@@ -378,14 +392,22 @@ int main(int argc, char **argv)
 	_starpu_simgrid_actor_create("main", &do_starpu_main, _starpu_simgrid_get_host_by_name("MAIN"), argc, argv_cpy);
 
 	/* And run maestro in the main thread */
+#if defined(STARPU_SIMGRID_HAVE_SIMGRID_INIT) && defined(HAVE_SG_ACTOR_INIT)
+	simgrid_run();
+#else
 	MSG_main();
+#endif
 	return main_ret;
 }
 
-#if defined(HAVE_MSG_PROCESS_ATTACH) || defined(MSG_process_attach)
+#if defined(HAVE_MSG_PROCESS_ATTACH) || defined(MSG_process_attach) || defined(HAVE_SG_ACTOR_ATTACH)
 static void maestro(void *data STARPU_ATTRIBUTE_UNUSED)
 {
+#if defined(STARPU_SIMGRID_HAVE_SIMGRID_INIT) && defined(HAVE_SG_ACTOR_INIT)
+	simgrid_run();
+#else
 	MSG_main();
+#endif
 }
 #endif
 
@@ -721,7 +743,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
 /* Note: simgrid is not parallel, so there is no need to hold locks for management of transfers.  */
 LIST_TYPE(transfer,
-#ifdef HAVE_SG_HOST_SEND_TO
+#if defined(HAVE_SG_HOST_SEND_TO) || defined(HAVE_SG_HOST_SENDTO)
 	size_t size;
 #else
 	msg_task_t task;
@@ -854,15 +876,20 @@ static void *transfer_execute(void *arg)
 		if (t->last_transfer == transfer)
 			t->last_transfer = NULL;
 
-#ifdef HAVE_SG_HOST_SEND_TO
+#if defined(HAVE_SG_HOST_SEND_TO) || defined(HAVE_SG_HOST_SENDTO)
 		if (transfer->size)
 #else
 		if (transfer->task)
 #endif
 		{
 			_STARPU_DEBUG("transfer %p started\n", transfer);
-#ifdef HAVE_SG_HOST_SEND_TO
-			sg_host_send_to(_starpu_simgrid_memory_node_get_host(transfer->src_node),
+#if defined(HAVE_SG_HOST_SEND_TO) || defined(HAVE_SG_HOST_SENDTO)
+#ifdef HAVE_SG_HOST_SENDTO
+			sg_host_sendto
+#else
+			sg_host_send_to
+#endif
+				(_starpu_simgrid_memory_node_get_host(transfer->src_node),
 					_starpu_simgrid_memory_node_get_host(transfer->dst_node),
 					transfer->size);
 #else
@@ -963,7 +990,7 @@ static void _starpu_simgrid_wait_transfers(void)
 	struct transfer *sync = transfer_new();
 	struct transfer *cur;
 
-#ifdef HAVE_SG_HOST_SEND_TO
+#if defined(HAVE_SG_HOST_SEND_TO) || defined(HAVE_SG_HOST_SENDTO)
 	sync->size = 0;
 #else
 	sync->task = NULL;
@@ -1031,7 +1058,7 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 
 	_STARPU_DEBUG("creating transfer %p for %lu bytes\n", transfer, (unsigned long) size);
 
-#ifdef HAVE_SG_HOST_SEND_TO
+#if defined(HAVE_SG_HOST_SEND_TO) || defined(HAVE_SG_HOST_SENDTO)
 	transfer->size = size;
 #else
 	msg_task_t task;

+ 15 - 2
src/core/simgrid.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2016,2017                                Inria
  * Copyright (C) 2013,2017                                CNRS
- * Copyright (C) 2012-2019                                Université de Bordeaux
+ * Copyright (C) 2012-2020                                Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -27,10 +27,23 @@ extern "C"
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_HAVE_SIMGRID_MSG_H
 #include <simgrid/msg.h>
-#else
+#elif defined(STARPU_HAVE_MSG_MSG_H)
 #include <msg/msg.h>
 #endif
 
+#ifdef STARPU_HAVE_XBT_BASE_H
+#include <xbt/base.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_VERSION_H
+#include <simgrid/version.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_ZONE_H
+#include <simgrid/zone.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_HOST_H
+#include <simgrid/host.h>
+#endif
+
 #include <xbt/xbt_os_time.h>
 
 struct _starpu_pthread_args

+ 28 - 9
src/core/simgrid_cpp.cpp

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2016,2017                                CNRS
- * Copyright (C) 2012-2019                                Université de Bordeaux
+ * Copyright (C) 2012-2020                                Université de Bordeaux
  * Copyright (C) 2016,2017                                Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,19 +21,12 @@
 #include <common/config.h>
 
 #ifdef STARPU_SIMGRID
-#ifdef STARPU_HAVE_SIMGRID_MSG_H
-#include <simgrid/msg.h>
-#else
-#include <msg/msg.h>
-#endif
 #if SIMGRID_VERSION >= 32190
 #include <simgrid/simix.hpp>
 #else
 #include <simgrid/simix.h>
 #endif
-#ifdef STARPU_HAVE_SIMGRID_HOST_H
-#include <simgrid/host.h>
-#endif
+#include <smpi/smpi.h>
 
 /* thread_create function which implements inheritence of MPI privatization */
 /* See https://github.com/simgrid/simgrid/issues/139 */
@@ -42,9 +35,26 @@ typedef struct
 {
 	void_f_pvoid_t code;
 	void *userparam;
+#if SIMGRID_VERSION < 32501
 	void *father_data;
+#endif
 } thread_data_t;
 
+#if SIMGRID_VERSION >= 32501
+static void *_starpu_simgrid_xbt_thread_create_wrapper(void *arg)
+{
+	thread_data_t *t = (thread_data_t *) arg;
+	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
+	starpu_sleep(0.000001);
+#ifdef HAVE_SMPI_THREAD_CREATE
+	/* Make this actor inherit SMPI data from father actor */
+	SMPI_thread_create();
+#endif
+	t->code(t->userparam);
+	free(t);
+	return NULL;
+}
+#else
 #if SIMGRID_VERSION >= 32190
 static void _starpu_simgrid_xbt_thread_create_wrapper(void)
 #else
@@ -74,9 +84,17 @@ static int _starpu_simgrid_xbt_thread_create_wrapper(int argc STARPU_ATTRIBUTE_U
 	return 0;
 #endif
 }
+#endif
 
 void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code, void *param)
 {
+#if SIMGRID_VERSION >= 32501
+	starpu_pthread_t t;
+	thread_data_t *res = (thread_data_t *) malloc(sizeof(thread_data_t));
+	res->userparam = param;
+	res->code = code;
+	starpu_pthread_create_on(name, &t, NULL, _starpu_simgrid_xbt_thread_create_wrapper, res, sg_host_self());
+#else
 #if SIMGRID_VERSION >= 32190 || defined(HAVE_SIMCALL_PROCESS_CREATE) || defined(simcall_process_create)
 #ifdef HAVE_SMX_ACTOR_T
 	smx_actor_t process STARPU_ATTRIBUTE_UNUSED;
@@ -122,6 +140,7 @@ void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code, vo
 #else
 	STARPU_ABORT_MSG("Can't run StarPU-Simgrid-MPI with a Simgrid version which does not provide simcall_process_create and does not fix https://github.com/simgrid/simgrid/issues/139 , sorry.");
 #endif
+#endif
 }
 
 #endif

+ 27 - 11
src/core/task.c

@@ -1,9 +1,9 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2019                                Inria
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2017                                     Erwan Leria
- * Copyright (C) 2010-2019                                CNRS
+ * Copyright (C) 2010-2020                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2016                                     Uppsala University
@@ -245,6 +245,12 @@ static int limit_max_submitted_tasks;
 static int watchdog_crash;
 static int watchdog_delay;
 
+/*
+ * Function to call when watchdog detects that no task has finished for more than STARPU_WATCHDOG_TIMEOUT seconds
+ */
+static void (*watchdog_hook)(void *) = NULL;
+static void * watchdog_hook_arg = NULL;
+
 #define _STARPU_TASK_MAGIC 42
 
 /* Called once at starpu_init */
@@ -788,7 +794,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 					  "Codelet %p has too many buffers (%d vs max %d). Either use --enable-maxbuffers configure option to increase the max, or use dyn_handles instead of handles.",
 					  task->cl, STARPU_TASK_GET_NBUFFERS(task), STARPU_NMAXBUFS);
 
-		if (task->dyn_handles)
+		if (STARPU_UNLIKELY(task->dyn_handles))
 		{
 			_STARPU_MALLOC(task->dyn_interfaces, nbuffers * sizeof(void *));
 		}
@@ -821,7 +827,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 		}
 
 		/* Check the type of worker(s) required by the task exist */
-		if (!_starpu_worker_exists(task))
+		if (STARPU_UNLIKELY(!_starpu_worker_exists(task)))
 		{
 			_STARPU_LOG_OUT_TAG("ENODEV");
 			return -ENODEV;
@@ -830,7 +836,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 		/* In case we require that a task should be explicitely
 		 * executed on a specific worker, we make sure that the worker
 		 * is able to execute this task.  */
-		if (task->execute_on_a_specific_worker && !starpu_combined_worker_can_execute_task(task->workerid, task, 0))
+		if (STARPU_UNLIKELY(task->execute_on_a_specific_worker && !starpu_combined_worker_can_execute_task(task->workerid, task, 0)))
 		{
 			_STARPU_LOG_OUT_TAG("ENODEV");
 			return -ENODEV;
@@ -932,7 +938,7 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 			_starpu_detect_implicit_data_deps(task);
 	}
 
-	if (bundle)
+	if (STARPU_UNLIKELY(bundle))
 	{
 		/* We need to make sure that models for other tasks of the
 		 * bundle are also loaded, so the scheduler can estimate the
@@ -967,7 +973,7 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 	 * dependency. */
 	task->status = STARPU_TASK_BLOCKED;
 
-	if (profiling)
+	if (STARPU_UNLIKELY(profiling))
 		_starpu_clock_gettime(&info->submit_time);
 
 	ret = _starpu_submit_job(j, nodeps);
@@ -1563,14 +1569,18 @@ static void *watchdog_func(void *arg)
 		if (!config->watchdog_ok && last_nsubmitted
 				&& last_nsubmitted == starpu_task_nsubmitted())
 		{
-			_STARPU_MSG("The StarPU watchdog detected that no task finished for %fs (can be configured through STARPU_WATCHDOG_TIMEOUT)\n",
-				    timeout);
+			if (watchdog_hook == NULL)
+				_STARPU_MSG("The StarPU watchdog detected that no task finished for %fs (can be configured through STARPU_WATCHDOG_TIMEOUT)\n",
+									timeout);
+			else
+				watchdog_hook(watchdog_hook_arg);
+
 			if (watchdog_crash)
 			{
 				_STARPU_MSG("Crashing the process\n");
 				raise(SIGABRT);
 			}
-			else
+			else if (watchdog_hook == NULL)
 				_STARPU_MSG("Set the STARPU_WATCHDOG_CRASH environment variable if you want to abort the process in such a case\n");
 		}
 		/* Only shout again after another period */
@@ -1580,7 +1590,13 @@ static void *watchdog_func(void *arg)
 	return NULL;
 }
 
-void _starpu_watchdog_init(void)
+void starpu_task_watchdog_set_hook(void (*hook)(void *), void *hook_arg)
+{
+	watchdog_hook = hook;
+	watchdog_hook_arg = hook_arg;
+}
+
+void _starpu_watchdog_init()
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	char *timeout_env = starpu_getenv("STARPU_WATCHDOG_TIMEOUT");

+ 5 - 5
src/core/task_bundle.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2014                                Université de Bordeaux
+ * Copyright (C) 2011-2014, 2020                                Université de Bordeaux
  * Copyright (C) 2011,2012                                Inria
  * Copyright (C) 2011,2013,2015-2017                      CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -29,15 +29,15 @@
 /* Initialize a task bundle */
 void starpu_task_bundle_create(starpu_task_bundle_t *bundle)
 {
-	_STARPU_MALLOC(*bundle, sizeof(struct _starpu_task_bundle));
+	_STARPU_CALLOC(*bundle, 1, sizeof(struct _starpu_task_bundle));
 
-	STARPU_PTHREAD_MUTEX_INIT(&(*bundle)->mutex, NULL);
+	STARPU_PTHREAD_MUTEX_INIT0(&(*bundle)->mutex, NULL);
 	/* Of course at the beginning a bundle is open,
 	 * user can insert and remove tasks from it */
-	(*bundle)->closed = 0;
+	//(*bundle)->closed = 0;
 
 	/* Start with an empty list */
-	(*bundle)->list = NULL;
+	//(*bundle)->list = NULL;
 
 }
 

+ 11 - 5
src/core/topology.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2017                                Inria
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2010-2017, 2019                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2016                                     Uppsala University
@@ -21,6 +21,9 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <common/config.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
 #include <core/workers.h>
 #include <core/debug.h>
 #include <core/topology.h>
@@ -2030,12 +2033,15 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 			 (previous >= 0 && previous == workerid) ||
 			 (name && cpu_name[cpuid] && !strcmp(name, cpu_name[cpuid])) ) )
 		{
+			char hostname[65];
+			gethostname(hostname, sizeof(hostname));
+
 			if (previous == STARPU_ACTIVETHREAD)
-				_STARPU_DISP("Warning: active thread %s was already bound to PU %d\n", cpu_name[cpuid], cpuid);
+				_STARPU_DISP("[%s] Warning: active thread %s was already bound to PU %d\n", hostname, cpu_name[cpuid], cpuid);
 			else if (previous == STARPU_NONACTIVETHREAD)
-				_STARPU_DISP("Warning: non-active thread %s was already bound to PU %d\n", cpu_name[cpuid], cpuid);
+				_STARPU_DISP("[%s] Warning: non-active thread %s was already bound to PU %d\n", hostname, cpu_name[cpuid], cpuid);
 			else
-				_STARPU_DISP("Warning: worker %d was already bound to PU %d\n", previous, cpuid);
+				_STARPU_DISP("[%s] Warning: worker %d was already bound to PU %d\n", hostname, previous, cpuid);
 
 			if (workerid == STARPU_ACTIVETHREAD)
 				_STARPU_DISP("and we were told to also bind active thread %s to it.\n", name);
@@ -2048,7 +2054,7 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 
 			if (workerid >= 0)
 				/* This shouldn't happen for workers */
-				_STARPU_DISP("Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", config->topology.nhwcpus, config->topology.nhwpus);
+				_STARPU_DISP("[%s] Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", hostname, config->topology.nhwcpus, config->topology.nhwpus);
 			ret = -1;
 		}
 		else

+ 2 - 2
src/core/workers.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2017,2019                           Inria
- * Copyright (C) 2008-2019                                Université de Bordeaux
+ * Copyright (C) 2008-2020                                Université de Bordeaux
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2016                                     Uppsala University
@@ -1202,7 +1202,7 @@ int _starpu_get_catch_signals(void);
 static inline int _starpu_perf_counter_paused(void) 
 {
 	STARPU_RMB();
-	return _starpu_config.perf_counter_pause_depth > 0;
+	return STARPU_UNLIKELY(_starpu_config.perf_counter_pause_depth > 0);
 }
 
 /* @}*/

+ 49 - 38
src/datawizard/filters.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2016,2017                      Inria
- * Copyright (C) 2008-2019                                Université de Bordeaux
+ * Copyright (C) 2008-2020                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2013,2015-2019                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -228,24 +228,29 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		else
 			ops = initial_handle->ops;
 
+		/* As most of the fields must be initialized at NULL, let's put
+		 * 0 everywhere */
+		memset(child, 0, sizeof(*child));
 		_starpu_data_handle_init(child, ops, initial_handle->mf_node);
 
-		child->nchildren = 0;
-		child->nplans = 0;
-		child->switch_cl = NULL;
-		child->partitioned = 0;
-		child->readonly = 0;
+		//child->nchildren = 0;
+		//child->nplans = 0;
+		//child->switch_cl = NULL;
+		//child->partitioned = 0;
+		//child->readonly = 0;
 		child->active = inherit_state;
-		child->active_ro = 0;
-                child->mpi_data = NULL;
+		//child->active_ro = 0;
+                //child->mpi_data = NULL;
 		child->root_handle = initial_handle->root_handle;
 		child->father_handle = initial_handle;
-		child->active_children = NULL;
-		child->active_readonly_children = NULL;
-		child->nactive_readonly_children = 0;
+		//child->active_children = NULL;
+		//child->active_readonly_children = NULL;
+		//child->nactive_readonly_children = 0;
 		child->nsiblings = nparts;
 		if (inherit_state)
-			child->siblings = NULL;
+		{
+			//child->siblings = NULL;
+		}
 		else
 			child->siblings = childrenp;
 		child->sibling_index = i;
@@ -258,31 +263,31 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		/* initialize the chunk lock */
 		_starpu_data_requester_prio_list_init(&child->req_list);
 		_starpu_data_requester_prio_list_init(&child->reduction_req_list);
-		child->reduction_tmp_handles = NULL;
-		child->write_invalidation_req = NULL;
-		child->refcnt = 0;
-		child->unlocking_reqs = 0;
-		child->busy_count = 0;
-		child->busy_waiting = 0;
-		STARPU_PTHREAD_MUTEX_INIT(&child->busy_mutex, NULL);
-		STARPU_PTHREAD_COND_INIT(&child->busy_cond, NULL);
-		child->reduction_refcnt = 0;
+		//child->reduction_tmp_handles = NULL;
+		//child->write_invalidation_req = NULL;
+		//child->refcnt = 0;
+		//child->unlocking_reqs = 0;
+		//child->busy_count = 0;
+		//child->busy_waiting = 0;
+		STARPU_PTHREAD_MUTEX_INIT0(&child->busy_mutex, NULL);
+		STARPU_PTHREAD_COND_INIT0(&child->busy_cond, NULL);
+		//child->reduction_refcnt = 0;
 		_starpu_spin_init(&child->header_lock);
 
 		child->sequential_consistency = initial_handle->sequential_consistency;
 		child->initialized = initial_handle->initialized;
 		child->ooc = initial_handle->ooc;
 
-		STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
+		//STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
 		child->last_submitted_mode = STARPU_R;
-		child->last_sync_task = NULL;
-		child->last_submitted_accessors.task = NULL;
+		//child->last_sync_task = NULL;
+		//child->last_submitted_accessors.task = NULL;
 		child->last_submitted_accessors.next = &child->last_submitted_accessors;
 		child->last_submitted_accessors.prev = &child->last_submitted_accessors;
-		child->post_sync_tasks = NULL;
+		//child->post_sync_tasks = NULL;
 		/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
 		STARPU_HG_DISABLE_CHECKING(child->post_sync_tasks_cnt);
-		child->post_sync_tasks_cnt = 0;
+		//child->post_sync_tasks_cnt = 0;
 
 		/* The methods used for reduction are propagated to the
 		 * children. */
@@ -290,17 +295,19 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		child->init_cl = initial_handle->init_cl;
 
 #ifdef STARPU_USE_FXT
-		child->last_submitted_ghost_sync_id_is_valid = 0;
-		child->last_submitted_ghost_sync_id = 0;
-		child->last_submitted_ghost_accessors_id = NULL;
+		//child->last_submitted_ghost_sync_id_is_valid = 0;
+		//child->last_submitted_ghost_sync_id = 0;
+		//child->last_submitted_ghost_accessors_id = NULL;
 #endif
 
 		if (_starpu_global_arbiter)
 			/* Just for testing purpose */
 			starpu_data_assign_arbiter(child, _starpu_global_arbiter);
 		else
-			child->arbiter = NULL;
-		_starpu_data_requester_prio_list_init(&child->arbitered_req_list);
+		{
+			//child->arbiter = NULL;
+		}
+		_starpu_data_requester_prio_list_init0(&child->arbitered_req_list);
 
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
@@ -317,16 +324,20 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 			if (inherit_state || !initial_replicate->automatically_allocated)
 				child_replicate->allocated = initial_replicate->allocated;
 			else
-				child_replicate->allocated = 0;
+			{
+				//child_replicate->allocated = 0;
+			}
 			/* Do not allow memory reclaiming within the child for parent bits */
-			child_replicate->automatically_allocated = 0;
-			child_replicate->refcnt = 0;
+			//child_replicate->automatically_allocated = 0;
+			//child_replicate->refcnt = 0;
 			child_replicate->memory_node = node;
-			child_replicate->relaxed_coherency = 0;
+			//child_replicate->relaxed_coherency = 0;
 			if (inherit_state)
 				child_replicate->initialized = initial_replicate->initialized;
 			else
-				child_replicate->initialized = 0;
+			{
+				//child_replicate->initialized = 0;
+			}
 
 			/* update the interface */
 			void *initial_interface = starpu_data_get_interface_on_node(initial_handle, node);
@@ -336,8 +347,8 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 			f->filter_func(initial_interface, child_interface, f, i, nparts);
 		}
 
-		child->per_worker = NULL;
-		child->user_data = NULL;
+		//child->per_worker = NULL;
+		//child->user_data = NULL;
 
 		/* We compute the size and the footprint of the child once and
 		 * store it in the handle */

+ 2 - 1
src/datawizard/interfaces/bcsr_filters.c

@@ -87,7 +87,8 @@ void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_inte
 	if (bcsr_father->nzval)
 	{
 		uint8_t *nzval = (uint8_t *)(bcsr_father->nzval);
-		matrix_child->ptr = (uintptr_t)&nzval[firstentry + ptr_offset];
+		matrix_child->dev_handle = matrix_child->ptr = (uintptr_t)&nzval[firstentry + ptr_offset];
+		matrix_child->offset = 0;
 	}
 }
 

+ 53 - 51
src/datawizard/interfaces/data_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2017                                Inria
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2010-2019                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -262,69 +262,69 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	STARPU_ASSERT(handle);
 
 	/* initialize the new lock */
-	_starpu_data_requester_prio_list_init(&handle->req_list);
-	handle->refcnt = 0;
-	handle->unlocking_reqs = 0;
-	handle->busy_count = 0;
-	handle->busy_waiting = 0;
-	STARPU_PTHREAD_MUTEX_INIT(&handle->busy_mutex, NULL);
-	STARPU_PTHREAD_COND_INIT(&handle->busy_cond, NULL);
+	_starpu_data_requester_prio_list_init0(&handle->req_list);
+	//handle->refcnt = 0;
+	//handle->unlocking_reqs = 0;
+	//handle->busy_count = 0;
+	//handle->busy_waiting = 0;
+	STARPU_PTHREAD_MUTEX_INIT0(&handle->busy_mutex, NULL);
+	STARPU_PTHREAD_COND_INIT0(&handle->busy_cond, NULL);
 	_starpu_spin_init(&handle->header_lock);
 
 	/* first take care to properly lock the data */
 	_starpu_spin_lock(&handle->header_lock);
 
 	/* there is no hierarchy yet */
-	handle->nchildren = 0;
-	handle->nplans = 0;
-	handle->switch_cl = NULL;
-	handle->partitioned = 0;
-	handle->readonly = 0;
+	//handle->nchildren = 0;
+	//handle->nplans = 0;
+	//handle->switch_cl = NULL;
+	//handle->partitioned = 0;
+	//handle->readonly = 0;
 	handle->active = 1;
-	handle->active_ro = 0;
+	//handle->active_ro = 0;
 	handle->root_handle = handle;
-	handle->father_handle = NULL;
-	handle->active_children = NULL;
-	handle->active_readonly_children = NULL;
-	handle->nactive_readonly_children = 0;
-	handle->nsiblings = 0;
-	handle->siblings = NULL;
-	handle->sibling_index = 0; /* could be anything for the root */
+	//handle->father_handle = NULL;
+	//handle->active_children = NULL;
+	//handle->active_readonly_children = NULL;
+	//handle->nactive_readonly_children = 0;
+	//handle->nsiblings = 0;
+	//handle->siblings = NULL;
+	//handle->sibling_index = 0; /* could be anything for the root */
 	handle->depth = 1; /* the tree is just a node yet */
-        handle->mpi_data = NULL; /* invalid until set */
+        //handle->mpi_data = NULL; /* invalid until set */
 
-	handle->is_not_important = 0;
+	//handle->is_not_important = 0;
 
 	handle->sequential_consistency =
 		starpu_data_get_default_sequential_consistency_flag();
 	handle->initialized = home_node != -1;
 	handle->ooc = 1;
 
-	STARPU_PTHREAD_MUTEX_INIT(&handle->sequential_consistency_mutex, NULL);
+	STARPU_PTHREAD_MUTEX_INIT0(&handle->sequential_consistency_mutex, NULL);
 	handle->last_submitted_mode = STARPU_R;
-	handle->last_sync_task = NULL;
-	handle->last_submitted_accessors.task = NULL;
+	//handle->last_sync_task = NULL;
+	//handle->last_submitted_accessors.task = NULL;
 	handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
 	handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
-	handle->post_sync_tasks = NULL;
+	//handle->post_sync_tasks = NULL;
 
 	/* Tell helgrind that the race in _starpu_unlock_post_sync_tasks is fine */
 	STARPU_HG_DISABLE_CHECKING(handle->post_sync_tasks_cnt);
-	handle->post_sync_tasks_cnt = 0;
+	//handle->post_sync_tasks_cnt = 0;
 
 	/* By default, there are no methods available to perform a reduction */
-	handle->redux_cl = NULL;
-	handle->init_cl = NULL;
+	//handle->redux_cl = NULL;
+	//handle->init_cl = NULL;
 
-	handle->reduction_refcnt = 0;
-	_starpu_data_requester_prio_list_init(&handle->reduction_req_list);
-	handle->reduction_tmp_handles = NULL;
-	handle->write_invalidation_req = NULL;
+	//handle->reduction_refcnt = 0;
+	_starpu_data_requester_prio_list_init0(&handle->reduction_req_list);
+	//handle->reduction_tmp_handles = NULL;
+	//handle->write_invalidation_req = NULL;
 
 #ifdef STARPU_USE_FXT
-	handle->last_submitted_ghost_sync_id_is_valid = 0;
-	handle->last_submitted_ghost_sync_id = 0;
-	handle->last_submitted_ghost_accessors_id = NULL;
+	//handle->last_submitted_ghost_sync_id_is_valid = 0;
+	//handle->last_submitted_ghost_sync_id = 0;
+	//handle->last_submitted_ghost_accessors_id = NULL;
 #endif
 
 	handle->wt_mask = wt_mask;
@@ -339,8 +339,10 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 		/* Just for testing purpose */
 		starpu_data_assign_arbiter(handle, _starpu_global_arbiter);
 	else
-		handle->arbiter = NULL;
-	_starpu_data_requester_prio_list_init(&handle->arbitered_req_list);
+	{
+		//handle->arbiter = NULL;
+	}
+	_starpu_data_requester_prio_list_init0(&handle->arbitered_req_list);
 	handle->last_locality = -1;
 
 	/* that new data is invalid from all nodes perpective except for the
@@ -352,28 +354,28 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 		replicate = &handle->per_node[node];
 
 		replicate->memory_node = node;
-		replicate->relaxed_coherency = 0;
-		replicate->refcnt = 0;
+		//replicate->relaxed_coherency = 0;
+		//replicate->refcnt = 0;
 
 		if ((int) node == home_node)
 		{
 			/* this is the home node with the only valid copy */
 			replicate->state = STARPU_OWNER;
 			replicate->allocated = 1;
-			replicate->automatically_allocated = 0;
+			//replicate->automatically_allocated = 0;
 			replicate->initialized = 1;
 		}
 		else
 		{
 			/* the value is not available here yet */
 			replicate->state = STARPU_INVALID;
-			replicate->allocated = 0;
-			replicate->initialized = 0;
+			//replicate->allocated = 0;
+			//replicate->initialized = 0;
 		}
 	}
 
-	handle->per_worker = NULL;
-	handle->user_data = NULL;
+	//handle->per_worker = NULL;
+	//handle->user_data = NULL;
 
 	/* now the data is available ! */
 	_starpu_spin_unlock(&handle->header_lock);
@@ -451,8 +453,8 @@ int _starpu_data_handle_init(starpu_data_handle_t handle, struct starpu_data_int
 	handle->magic = 42;
 	handle->ops = interface_ops;
 	handle->mf_node = mf_node;
-	handle->mpi_data = NULL;
-	handle->partition_automatic_disabled = 0;
+	//handle->mpi_data = NULL;
+	//handle->partition_automatic_disabled = 0;
 
 	size_t interfacesize = interface_ops->interface_size;
 
@@ -761,12 +763,12 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 		int home_node = handle->home_node;
 		if (home_node >= 0)
 		{
-			struct _starpu_unregister_callback_arg arg;
+			struct _starpu_unregister_callback_arg arg = { 0 };
 			arg.handle = handle;
 			arg.memory_node = (unsigned)home_node;
 			arg.terminated = 0;
-			STARPU_PTHREAD_MUTEX_INIT(&arg.mutex, NULL);
-			STARPU_PTHREAD_COND_INIT(&arg.cond, NULL);
+			STARPU_PTHREAD_MUTEX_INIT0(&arg.mutex, NULL);
+			STARPU_PTHREAD_COND_INIT0(&arg.cond, NULL);
 
 			if (!_starpu_attempt_to_submit_data_request_from_apps(handle, STARPU_R,
 					_starpu_data_unregister_fetch_data_callback, &arg))

+ 6 - 6
src/datawizard/memstats.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012,2015-2017                      CNRS
- * Copyright (C) 2009,2010,2012,2014                      Université de Bordeaux
+ * Copyright (C) 2009,2010,2012,2014,2020                 Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,11 +31,11 @@ void _starpu_memory_stats_init_per_node(starpu_data_handle_t handle STARPU_ATTRI
 {
 #ifdef STARPU_MEMORY_STATS
 	/* Stats initilization */
-	handle->memory_stats->direct_access[node]=0;
-	handle->memory_stats->loaded_shared[node]=0;
-	handle->memory_stats->shared_to_owner[node]=0;
-	handle->memory_stats->loaded_owner[node]=0;
-	handle->memory_stats->invalidated[node]=0;
+	//handle->memory_stats->direct_access[node]=0;
+	//handle->memory_stats->loaded_shared[node]=0;
+	//handle->memory_stats->shared_to_owner[node]=0;
+	//handle->memory_stats->loaded_owner[node]=0;
+	//handle->memory_stats->invalidated[node]=0;
 #endif
 }
 

+ 33 - 15
src/datawizard/user_interactions.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2017                           Inria
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2018                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -85,9 +85,9 @@ static inline void _starpu_data_acquire_wrapper_init(struct user_interaction_wra
 	wrapper->handle = handle;
 	wrapper->node = node;
 	wrapper->mode = mode;
-	wrapper->finished = 0;
-	STARPU_PTHREAD_COND_INIT(&wrapper->cond, NULL);
-	STARPU_PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
+	//wrapper->finished = 0;
+	STARPU_PTHREAD_COND_INIT0(&wrapper->cond, NULL);
+	STARPU_PTHREAD_MUTEX_INIT0(&wrapper->lock, NULL);
 }
 
 /* Called to signal completion of asynchronous data acquisition */
@@ -216,6 +216,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 	{
 		struct starpu_task *new_task;
 		struct _starpu_job *pre_sync_job, *post_sync_job;
+		int submit_pre_sync = 0;
 		wrapper->pre_sync_task = starpu_task_create();
 		wrapper->pre_sync_task->name = "_starpu_data_acquire_cb_pre";
 		wrapper->pre_sync_task->detach = 1;
@@ -237,18 +238,26 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 		if (quick)
 			pre_sync_job->quick_next = post_sync_job;
 
-		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, wrapper->post_sync_task, &_starpu_get_job_associated_to_task(wrapper->post_sync_task)->implicit_dep_slot, handle, mode, sequential_consistency);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper->pre_sync_task, &submit_pre_sync, wrapper->post_sync_task, &_starpu_get_job_associated_to_task(wrapper->post_sync_task)->implicit_dep_slot, handle, mode, sequential_consistency);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
-		if (new_task)
+		if (STARPU_UNLIKELY(new_task))
 		{
 			int ret = _starpu_task_submit_internally(new_task);
 			STARPU_ASSERT(!ret);
 		}
 
-		/* TODO detect if this is superflous */
-		int ret = _starpu_task_submit_internally(wrapper->pre_sync_task);
-		STARPU_ASSERT(!ret);
+		if (submit_pre_sync)
+		{
+			int ret = _starpu_task_submit_internally(wrapper->pre_sync_task);
+			STARPU_ASSERT(!ret);
+		}
+		else
+		{
+			wrapper->pre_sync_task->detach = 0;
+			starpu_task_destroy(wrapper->pre_sync_task);
+			starpu_data_acquire_cb_pre_sync_callback(wrapper);
+		}
 	}
 	else
 	{
@@ -360,6 +369,7 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum star
 	if (sequential_consistency)
 	{
 		struct starpu_task *new_task;
+		int submit_pre_sync = 0;
 		wrapper.pre_sync_task = starpu_task_create();
 		wrapper.pre_sync_task->name = "_starpu_data_acquire_pre";
 		wrapper.pre_sync_task->detach = 0;
@@ -370,18 +380,26 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum star
 		wrapper.post_sync_task->detach = 1;
 		wrapper.post_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
 
-		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, &_starpu_get_job_associated_to_task(wrapper.post_sync_task)->implicit_dep_slot, handle, mode, sequential_consistency);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, &submit_pre_sync, wrapper.post_sync_task, &_starpu_get_job_associated_to_task(wrapper.post_sync_task)->implicit_dep_slot, handle, mode, sequential_consistency);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
-		if (new_task)
+
+		if (STARPU_UNLIKELY(new_task))
 		{
 			int ret = _starpu_task_submit_internally(new_task);
 			STARPU_ASSERT(!ret);
 		}
 
-		/* TODO detect if this is superflous */
-		wrapper.pre_sync_task->synchronous = 1;
-		int ret = _starpu_task_submit_internally(wrapper.pre_sync_task);
-		STARPU_ASSERT(!ret);
+		if (submit_pre_sync)
+		{
+			wrapper.pre_sync_task->synchronous = 1;
+			int ret = _starpu_task_submit_internally(wrapper.pre_sync_task);
+			STARPU_ASSERT(!ret);
+		}
+		else
+		{
+			wrapper.pre_sync_task->detach = 0;
+			starpu_task_destroy(wrapper.pre_sync_task);
+		}
 	}
 	else
 	{

+ 7 - 7
src/drivers/mpi/driver_mpi_source.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2016,2017                                Inria
  * Copyright (C) 2017,2019                                CNRS
- * Copyright (C) 2017                                     Université de Bordeaux
+ * Copyright (C) 2017,2020                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -50,13 +50,13 @@ int _starpu_mpi_copy_mpi_to_ram_async(void *src, unsigned src_node, void *dst, u
 int _starpu_mpi_copy_ram_to_mpi_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size, void * event);
 int _starpu_mpi_copy_sink_to_sink_async(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size, void * event);
 
-int _starpu_mpi_copy_data_from_mpi_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_mpi_copy_data_from_mpi_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_mpi_copy_data_from_cpu_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_mpi_copy_interface_from_mpi_to_cpu(starpu_interface_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_interface_request *req);
+int _starpu_mpi_copy_interface_from_mpi_to_mpi(starpu_interface_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_interface_request *req);
+int _starpu_mpi_copy_interface_from_cpu_to_mpi(starpu_interface_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_interface_request *req);
 
-int _starpu_mpi_copy_interface_from_mpi_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
-int _starpu_mpi_copy_interface_from_mpi_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
-int _starpu_mpi_copy_interface_from_cpu_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mpi_copy_data_from_mpi_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mpi_copy_data_from_mpi_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mpi_copy_data_from_cpu_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
 
 int _starpu_mpi_is_direct_access_supported(unsigned node, unsigned handling_node);
 uintptr_t _starpu_mpi_malloc_on_node(unsigned dst_node, size_t size, int flags);

+ 2 - 2
src/profiling/bound.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2014                           Inria
- * Copyright (C) 2010-2017,2019                           Université de Bordeaux
+ * Copyright (C) 2010-2017,2019-2020                      Université de Bordeaux
  * Copyright (C) 2010-2017,2019                           CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -257,7 +257,7 @@ static void new_task(struct _starpu_job *j)
 /* A new task was submitted, record it */
 void _starpu_bound_record(struct _starpu_job *j)
 {
-	if (!_starpu_bound_recording)
+	if (STARPU_LIKELY(!_starpu_bound_recording))
 		return;
 
 	if (!good_job(j))

+ 2 - 2
src/util/openmp_runtime_support.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2014-2018                                Inria
  * Copyright (C) 2014-2017,2019                           CNRS
- * Copyright (C) 2015,2017,2019                           Université de Bordeaux
+ * Copyright (C) 2015,2017,2019-2020                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -319,7 +319,7 @@ static struct starpu_omp_region *create_omp_region_struct(struct starpu_omp_regi
 	_STARPU_CALLOC(region, 1, sizeof(*region));
 	region->parent_region = parent_region;
 	region->owner_device = owner_device;
-	starpu_omp_thread_list_init(&region->thread_list);
+	starpu_omp_thread_list_init0(&region->thread_list);
 
 	_starpu_spin_init(&region->lock);
 	_starpu_spin_init(&region->registered_handles_lock);

+ 11 - 1
tests/Makefile.am

@@ -32,6 +32,9 @@ EXTRA_DIST =					\
 	regression/profiles.in			\
 	regression/regression.sh.in		\
 	regression/profiles.build.only.in	\
+	microbenchs/tasks_data_overhead.sh	\
+	microbenchs/sync_tasks_data_overhead.sh	\
+	microbenchs/async_tasks_data_overhead.sh	\
 	microbenchs/tasks_size_overhead.sh	\
 	microbenchs/tasks_size_overhead_sched.sh	\
 	microbenchs/tasks_size_overhead_scheds.sh	\
@@ -396,11 +399,18 @@ examplebin_PROGRAMS = \
 	microbenchs/tasks_size_overhead		\
 	microbenchs/local_pingpong
 examplebin_SCRIPTS = \
+	microbenchs/tasks_data_overhead.sh \
+	microbenchs/sync_tasks_data_overhead.sh \
+	microbenchs/async_tasks_data_overhead.sh \
 	microbenchs/tasks_size_overhead.gp \
 	microbenchs/tasks_size_overhead.sh
 if !STARPU_SIMGRID
 if !STARPU_USE_MPI_MASTER_SLAVE
-TESTS += microbenchs/tasks_size_overhead_scheds.sh
+TESTS += \
+	microbenchs/tasks_data_overhead.sh \
+	microbenchs/sync_tasks_data_overhead.sh \
+	microbenchs/async_tasks_data_overhead.sh \
+	microbenchs/tasks_size_overhead_scheds.sh
 endif
 endif
 

+ 2 - 2
tests/cholesky/sched.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014,2019                      Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2019-2020                 Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -39,7 +39,7 @@ trace_sched()
 		do
 			echo "$iter / $maxiter"
 			 echo "$ROOTDIR/examples/cholesky/dw_cholesky $OPTIONS 2> /dev/null"
-			 val=`$ROOTDIR/examples/cholesky/dw_cholesky $OPTIONS 2> /dev/null`
+			 val=`$STARPU_LAUNCH $ROOTDIR/examples/cholesky/dw_cholesky $OPTIONS 2> /dev/null`
 			 echo "$val" >> $filename
 		done
 	done

+ 3 - 3
tests/cholesky/sched_one_gpu.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011,2014,2019                      Université de Bordeaux
+# Copyright (C) 2009-2011,2014,2019-2020                 Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -50,7 +50,7 @@ trace_sched()
 		do
 			echo "$iter / $maxiter"
 			echo "$ROOTDIR/examples/cholesky/dw_cholesky $OPTIONS 2> /dev/null"
-			val=`$ROOTDIR/examples/cholesky/dw_cholesky $OPTIONS 2> /dev/null`
+			val=`$STARPU_LAUNCH $ROOTDIR/examples/cholesky/dw_cholesky $OPTIONS 2> /dev/null`
 			echo "$val" >> $filename
 			echo "$val"
 		done
@@ -67,7 +67,7 @@ mkdir -p $TIMINGDIR
 # calibrate
 for i in `seq 1 5` 
 do
-STARPU_SCHED="dm" $ROOTDIR/examples/cholesky/dw_cholesky -nblocks 16 -size 16384 2> /dev/null
+STARPU_SCHED="dm" $STARPU_LAUNCH $ROOTDIR/examples/cholesky/dw_cholesky -nblocks 16 -size 16384 2> /dev/null
 done
 
 for sched in $schedlist

+ 2 - 1
tests/cholesky_ctxs/evaluate_expression.sh

@@ -3,6 +3,7 @@
 #
 # Copyright (C) 2011                                     Inria
 # Copyright (C) 2012,2017                                CNRS
+# Copyright (C) 2020                                     Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -33,7 +34,7 @@ for s in `seq 1 $nsamples`
 do
     echo "$ROOTDIR/examples/$BENCH_NAME $OPTIONS"
     
-    val=`$ROOTDIR/examples/$BENCH_NAME $OPTIONS`
+    val=`$STARPU_LAUNCH $ROOTDIR/examples/$BENCH_NAME $OPTIONS`
     
     echo "$val"
     

+ 24 - 24
tests/coverage/coverage.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010,2011,2014,2017                      Université de Bordeaux
+# Copyright (C) 2010,2011,2014,2017,2020                 Université de Bordeaux
 # Copyright (C) 2010,2011,2015,2017                      CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -29,109 +29,109 @@ apps()
 {
     if [ -f $exampledir/basic_examples/block ] ; then
 	echo "block opencl"
-	STARPU_NCUDA=0 STARPU_NCPUS=0 $exampledir/basic_examples/block
+	STARPU_NCUDA=0 STARPU_NCPUS=0 $STARPU_LAUNCH $exampledir/basic_examples/block
 	check_success $?
     fi
 
     if [ -f $exampledir/basic_examples/variable ] ; then
 	echo "variable opencl"
-	STARPU_NCUDA=0 STARPU_NCPUS=0 $exampledir/basic_examples/variable 100
+	STARPU_NCUDA=0 STARPU_NCPUS=0 $STARPU_LAUNCH $exampledir/basic_examples/variable 100
 	check_success $?
 
 	echo "variable no worker"
-	STARPU_NCUDA=0 STARPU_NOPENCL=0 STARPU_NCPUS=0 $exampledir/basic_examples/variable
+	STARPU_NCUDA=0 STARPU_NOPENCL=0 STARPU_NCPUS=0 $STARPU_LAUNCH $exampledir/basic_examples/variable
 	check_success $?
     fi
 
     if [ -f $exampledir/incrementer/incrementer ] ; then
 	echo "incrementer opencl"
-	STARPU_NCUDA=0 STARPU_NCPUS=0 $exampledir/incrementer/incrementer 10
+	STARPU_NCUDA=0 STARPU_NCPUS=0 $STARPU_LAUNCH $exampledir/incrementer/incrementer 10
 	check_success $?
 
 	echo "incrementer no worker"
-	STARPU_NCUDA=0 STARPU_NOPENCL=0 STARPU_NCPUS=0 $exampledir/incrementer/incrementer
+	STARPU_NCUDA=0 STARPU_NOPENCL=0 STARPU_NCPUS=0 $STARPU_LAUNCH $exampledir/incrementer/incrementer
 	check_success $?
     fi
 
     if [ -f $exampledir/tag_example/tag_example ] ; then
 	echo "tag_example"
-	$exampledir/tag_example/tag_example -iter 64 -i 128 -j 24
+	$STARPU_LAUNCH $exampledir/tag_example/tag_example -iter 64 -i 128 -j 24
 	check_success $?
     fi
 
     if [ -f $exampledir/tag_example/tag_example2 ] ; then
 	echo "tag_example2"
-	$exampledir/tag_example/tag_example2 -iter 64 -i 128
+	$STARPU_LAUNCH $exampledir/tag_example/tag_example2 -iter 64 -i 128
 	check_success $?
     fi
 
     if [ -f $exampledir/cholesky/dw_cholesky ] ; then
 	echo "chol.dm"
-	STARPU_CALIBRATE=1 STARPU_SCHED="dm" $exampledir/cholesky/dw_cholesky -pin
+	STARPU_CALIBRATE=1 STARPU_SCHED="dm" $STARPU_LAUNCH $exampledir/cholesky/dw_cholesky -pin
 	check_success $?
 
 	echo "chol.dmda"
-	STARPU_CALIBRATE=1 STARPU_SCHED="dmda" $exampledir/cholesky/dw_cholesky -pin
+	STARPU_CALIBRATE=1 STARPU_SCHED="dmda" $STARPU_LAUNCH $exampledir/cholesky/dw_cholesky -pin
 	check_success $?
 
 	echo "chol.cpu"
-	STARPU_CALIBRATE=1 STARPU_NCUDA=0 STARPU_SCHED="dm" $exampledir/cholesky/dw_cholesky -pin
+	STARPU_CALIBRATE=1 STARPU_NCUDA=0 STARPU_SCHED="dm" $STARPU_LAUNCH $exampledir/cholesky/dw_cholesky -pin
 	check_success $?
 
 	echo "chol.gpu"
-	STARPU_CALIBRATE=1 STARPU_NCPUS=0 STARPU_SCHED="dm" $exampledir/cholesky/dw_cholesky -pin
+	STARPU_CALIBRATE=1 STARPU_NCPUS=0 STARPU_SCHED="dm" $STARPU_LAUNCH $exampledir/cholesky/dw_cholesky -pin
 	check_success $?
     fi
 
     if [ -f $exampledir/heat/heat ] ; then
 	echo "heat.dm.4k.calibrate.v2"
-	STARPU_CALIBRATE=1 STARPU_SCHED="dm" $exampledir/heat/heat -ntheta 66 -nthick 66 -nblocks 4 -v2 -pin
+	STARPU_CALIBRATE=1 STARPU_SCHED="dm" $STARPU_LAUNCH $exampledir/heat/heat -ntheta 66 -nthick 66 -nblocks 4 -v2 -pin
 	check_success $?
 
 	echo "heat.dm.8k.calibrate.v2"
-	STARPU_CALIBRATE=1 STARPU_SCHED="dm" $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -v2 -pin
+	STARPU_CALIBRATE=1 STARPU_SCHED="dm" $STARPU_LAUNCH $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -v2 -pin
 	check_success $?
 
 	echo "heat.dm.8k.no.pin.v2"
-	STARPU_SCHED="dm" $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -v2
+	STARPU_SCHED="dm" $STARPU_LAUNCH $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -v2
 	check_success $?
 
 #	echo "heat.dm.8k.v2.no.prio"
-#	STARPU_SCHED="no-prio" $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2
+#	STARPU_SCHED="no-prio" $STARPU_LAUNCH $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2
 #	check_success $?
 
 	echo "heat.dm.8k.v2.random"
-	STARPU_SCHED="random" $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2
+	STARPU_SCHED="random" $STARPU_LAUNCH $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2
 	check_success $?
 
 	echo "heat.dm.8k.v2"
-	STARPU_SCHED="dm" $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2
+	STARPU_SCHED="dm" $STARPU_LAUNCH $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2
 	check_success $?
 
 	echo "heat.greedy.8k.v2"
-	STARPU_SCHED="greedy" $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2
+	STARPU_SCHED="greedy" $STARPU_LAUNCH $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2
 	check_success $?
 
 	echo "heat.8k.cg"
-	$exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 -cg
+	$STARPU_LAUNCH $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 -cg
 	check_success $?
 
 	echo "heat.dm.8k.cg"
-	STARPU_SCHED="dm" $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 -cg
+	STARPU_SCHED="dm" $STARPU_LAUNCH $exampledir/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 -cg
 	check_success $?
     fi
 
     if [ -f $exampledir/mult/dw_mult_no_stride ] ; then
 	echo "mult.dm.common"
-	STARPU_SCHED="dm" $exampledir/mult/dw_mult_no_stride -nblocks 4 -x 4096 -y 4096 -z 1024 -pin -common-model
+	STARPU_SCHED="dm" $STARPU_LAUNCH $exampledir/mult/dw_mult_no_stride -nblocks 4 -x 4096 -y 4096 -z 1024 -pin -common-model
 	check_success $?
 
 	echo "mult.dm"
-	STARPU_CALIBRATE=1 STARPU_SCHED="dm" $exampledir/mult/dw_mult_no_stride -nblocks 8 -x 4096 -y 4096 -z 4096 -pin
+	STARPU_CALIBRATE=1 STARPU_SCHED="dm" $STARPU_LAUNCH $exampledir/mult/dw_mult_no_stride -nblocks 8 -x 4096 -y 4096 -z 4096 -pin
 	check_success $?
 
 	echo "mult.dmda"
-	STARPU_CALIBRATE=1 STARPU_SCHED="dmda" $exampledir/mult/dw_mult_no_stride -nblocks 8 -x 4096 -y 4096 -z 4096 -pin
+	STARPU_CALIBRATE=1 STARPU_SCHED="dmda" $STARPU_LAUNCH $exampledir/mult/dw_mult_no_stride -nblocks 8 -x 4096 -y 4096 -z 4096 -pin
 	check_success $?
     fi
 }

+ 51 - 4
tests/datawizard/acquire_cb.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011,2013,2014,2016                      Université de Bordeaux
+ * Copyright (C) 2011,2013,2014,2016,2020                 Université de Bordeaux
  * Copyright (C) 2011-2013                                Inria
  * Copyright (C) 2011-2013,2017                           CNRS
  *
@@ -20,20 +20,27 @@
 #include "../helper.h"
 
 /*
- * Test that when using starpu_data_acquire_cb, the callback is properly called
+ * Test that when using starpu_data_acquire_cb, the callbacks are properly called
  */
 
 unsigned token = 0;
 starpu_data_handle_t token_handle;
 
 static
-void callback(void *arg)
+void callback_w(void *arg)
 {
 	(void)arg;
 	token = 42;
         starpu_data_release(token_handle);
 }
 
+static
+void callback_r(void *arg)
+{
+	(void)arg;
+        starpu_data_release(token_handle);
+}
+
 int main(int argc, char **argv)
 {
 	int ret;
@@ -42,8 +49,48 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	starpu_variable_data_register(&token_handle, -1, 0, sizeof(unsigned));
+	starpu_data_acquire_cb(token_handle, STARPU_W, callback_w, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_unregister(token_handle);
+	STARPU_ASSERT(token == 42);
+
+	token = 0;
+
+	starpu_variable_data_register(&token_handle, -1, 0, sizeof(unsigned));
+	starpu_data_acquire(token_handle, STARPU_W);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_release(token_handle);
+	starpu_data_unregister(token_handle);
+
+	token = 0;
+
 	starpu_variable_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, sizeof(unsigned));
-        starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+	/* These are getting executed immediately */
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_W, callback_w, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_W, callback_w, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_RW, callback_w, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_RW, callback_w, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+
+	starpu_data_acquire(token_handle, STARPU_W);
+	/* These will wait for our release */
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_W, callback_w, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_W, callback_w, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_RW, callback_w, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_RW, callback_w, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_acquire_cb(token_handle, STARPU_R, callback_r, NULL);
+	starpu_data_release(token_handle);
 
 	starpu_data_unregister(token_handle);
 

+ 1 - 1
tests/datawizard/interfaces/test_interfaces.sh

@@ -17,7 +17,7 @@
 
 for i in bcsr block coo csr matrix multiformat variable vector void
 do
-    ./tests/datawizard/interfaces/$i/${i}_interface
+    $STARPU_LAUNCH ./tests/datawizard/interfaces/$i/${i}_interface
     ret=$?
     if test "$ret" = "0"
     then

+ 3 - 3
tests/datawizard/locality.sh

@@ -4,7 +4,7 @@
 # Copyright (C) 2018                                     Federal University of Rio Grande do Sul (UFRGS)
 # Copyright (C) 2017                                     CNRS
 # Copyright (C) 2017                                     Inria
-# Copyright (C) 2017,2018-2019                           Université de Bordeaux
+# Copyright (C) 2017,2018-2020                           Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -33,8 +33,8 @@ if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 fi
 
 test -x $PREFIX/../../tools/starpu_fxt_tool || exit 77
-STARPU_SCHED=modular-eager STARPU_FXT_PREFIX=$PREFIX/ $PREFIX/locality
-$PREFIX/../../tools/starpu_fxt_tool -memory-states -label-deps -i $PREFIX/prof_file_${USER}_0
+STARPU_SCHED=modular-eager STARPU_FXT_PREFIX=$PREFIX/ $STARPU_LAUNCH $PREFIX/locality
+$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_tool -memory-states -label-deps -i $PREFIX/prof_file_${USER}_0
 
 # Check that they are approved by Grenoble :)
 

+ 5 - 5
tests/experiments/bandwidth_cuda/bench_bandwidth.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011,2014                           Université de Bordeaux
+# Copyright (C) 2009-2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -25,7 +25,7 @@ echo "H -> D"
 for log in `seq 1 13`
 do
 	size=$((2**$log))
-	echo "$size	`./cuda_bandwidth -pin -HtoD -size $size -cpu-ld $size -gpu-ld $size -iter 50`" >> .results/htod-pin.data 
+	echo "$size	`$STARPU_LAUNCH ./cuda_bandwidth -pin -HtoD -size $size -cpu-ld $size -gpu-ld $size -iter 50`" >> .results/htod-pin.data 
 done
 
 echo "D -> H"
@@ -33,7 +33,7 @@ echo "D -> H"
 for log in `seq 1 13`
 do
 	size=$((2**$log))
-	echo "$size	`./cuda_bandwidth -pin -size $size -cpu-ld $size -gpu-ld $size -iter 50`" >> .results/dtoh-pin.data 
+	echo "$size	`$STARPU_LAUNCH ./cuda_bandwidth -pin -size $size -cpu-ld $size -gpu-ld $size -iter 50`" >> .results/dtoh-pin.data 
 done
 
 ./bench_bandwidth.gp
@@ -48,7 +48,7 @@ do
 	for log in `seq 1 $stridelog`
 	do
 		size=$((2**$log))
-		echo "$size	`./cuda_bandwidth -pin -HtoD -size $size -cpu-ld $stridesize -gpu-ld $stridesize -iter 50`" >> .results/htod-pin.$stridesize.data 
+		echo "$size	`$STARPU_LAUNCH ./cuda_bandwidth -pin -HtoD -size $size -cpu-ld $stridesize -gpu-ld $stridesize -iter 50`" >> .results/htod-pin.$stridesize.data 
 	done
 done
 
@@ -62,6 +62,6 @@ do
 	for log in `seq 1 $stridelog`
 	do
 		size=$((2**$log))
-		echo "$size	`./cuda_bandwidth -pin -size $size -cpu-ld $stridesize -gpu-ld $stridesize -iter 50`" >> .results/dtoh-pin.$stridesize.data 
+		echo "$size	`$STARPU_LAUNCH ./cuda_bandwidth -pin -size $size -cpu-ld $stridesize -gpu-ld $stridesize -iter 50`" >> .results/dtoh-pin.$stridesize.data 
 	done
 done

+ 2 - 2
tests/heat/deps.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -40,7 +40,7 @@ trace_deps()
 		for iter in `seq 1 $maxiter`
 		do
 			echo "$iter / $maxiter"
-			 val=`$ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null`
+			 val=`$STARPU_LAUNCH $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null`
 			 echo "$val" >> $filename
 		done
 	done

+ 4 - 4
tests/heat/dmda.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011,2014, 2019                           Université de Bordeaux
+# Copyright (C) 2009-2011,2014,2019-2020                 Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -38,7 +38,7 @@ calibrate_point()
 		export STARPU_SCHED=$strat
 		export STARPU_CALIBRATE=1
 		export STARPU_PREFETCH=$prefetch
-		val=`$ROOTDIR/examples/heat/heat -pin -nblocks $nblocks -size $size -v3 2> /dev/null`
+		val=`$STARPU_LAUNCH $ROOTDIR/examples/heat/heat -pin -nblocks $nblocks -size $size -v3 2> /dev/null`
 		echo "$val"
 	done
 
@@ -76,14 +76,14 @@ do
 	export STARPU_SCHED="dm"
 	export STARPU_CALIBRATE=1
 	export STARPU_PREFETCH=1
-	valdm=$($ROOTDIR/examples/heat/heat -pin -size $size -nblocks $nblocks -v3 2> logdm)
+	valdm=$($STARPU_LAUNCH $ROOTDIR/examples/heat/heat -pin -size $size -nblocks $nblocks -v3 2> logdm)
 
 	calibrate_point "dmda" $nblocks 1
 
 	export STARPU_SCHED="dmda"
 	export STARPU_CALIBRATE=1
 	export STARPU_PREFETCH=1
-	valdmda=$($ROOTDIR/examples/heat/heat -pin -size $size -nblocks $nblocks -v3 2> logdmda)
+	valdmda=$($STARPU_LAUNCH $ROOTDIR/examples/heat/heat -pin -size $size -nblocks $nblocks -v3 2> logdmda)
 	
 	dmmiss=`grep "TOTAL MSI" logdm|sed -e "s/.*miss.*[1-9]* (\(.*\) %)/\1/"`
 	dmtotal=`grep "TOTAL transfers" logdm|sed -e "s/TOTAL transfers \(.*\) MB/\1/"`

+ 2 - 2
tests/heat/gflops.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -49,7 +49,7 @@ measure_heat()
 	for i in `seq 1 $nsample`
 	do
 		echo "iter $i/$nsample"
-		val=`$ROOTDIR/examples/heat -nthick $thick -ntheta $theta -nblocks $nblocks -pin -v2 2>/dev/null`
+		val=`$STARPU_LAUNCH $ROOTDIR/examples/heat -nthick $thick -ntheta $theta -nblocks $nblocks -pin -v2 2>/dev/null`
 		total=`echo "$val + $total" |bc -l`
 	done
 

+ 2 - 2
tests/heat/gflops_sched.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -51,7 +51,7 @@ measure_heat()
 	for i in `seq 1 $nsample`
 	do
 		echo "iter $i/$nsample"
-		val=`STARPU_SCHED=$policy $ROOTDIR/examples/heat/heat -nthick $thick -ntheta $theta -nblocks $nblocks -pin -v2 2>/dev/null`
+		val=`STARPU_SCHED=$policy $STARPU_LAUNCH $ROOTDIR/examples/heat/heat -nthick $thick -ntheta $theta -nblocks $nblocks -pin -v2 2>/dev/null`
 		total=`echo "$val + $total" |bc -l`
 	done
 

+ 9 - 9
tests/heat/granularity.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014, 2019                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2019-2020                 Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -62,7 +62,7 @@ trace_granularity()
 		for iter in `seq 1 $maxiter`
 		do
 			echo "$iter / $maxiter"
-			 val=`STARPU_NCPUS=8 STARPU_NCUDA=3 STARPU_SCHED="dmda" STARPU_PREFETCH=1 STARPU_CALIBRATE=1 $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null`
+			 val=`STARPU_NCPUS=8 STARPU_NCUDA=3 STARPU_SCHED="dmda" STARPU_PREFETCH=1 STARPU_CALIBRATE=1 $STARPU_LAUNCH $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null`
 			 echo "$val"
 			 echo "$val" >> $filename
 		done
@@ -103,7 +103,7 @@ trace_granularity_hybrid()
 		for iter in `seq 1 $maxiter`
 		do
 			echo "$iter / $maxiter"
-			 val=`STARPU_SCHED="dmda" STARPU_PREFETCH=1 STARPU_CALIBRATE=1 $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null`
+			 val=`STARPU_SCHED="dmda" STARPU_PREFETCH=1 STARPU_CALIBRATE=1 $STARPU_LAUNCH $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null`
 			 echo "$val"
 			 echo "$val" >> $filename
 		done
@@ -123,12 +123,12 @@ calibrate_grain()
 
 	OPTIONS="-pin -nblocks $blocks -size $size -v3"
 
-	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_SCHED="dm" $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null 
-	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_PREFETCH=1 STARPU_SCHED="dmda" $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null
-	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_PREFETCH=1 STARPU_SCHED="dmda" $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null
-	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_PREFETCH=1 STARPU_SCHED="dmda" $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null
-	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_PREFETCH=1 STARPU_SCHED="dmda" $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null
-	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_PREFETCH=1 STARPU_SCHED="dmda" $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null
+	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_SCHED="dm" $STARPU_LAUNCH $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null 
+	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_PREFETCH=1 STARPU_SCHED="dmda" $STARPU_LAUNCH $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null
+	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_PREFETCH=1 STARPU_SCHED="dmda" $STARPU_LAUNCH $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null
+	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_PREFETCH=1 STARPU_SCHED="dmda" $STARPU_LAUNCH $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null
+	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_PREFETCH=1 STARPU_SCHED="dmda" $STARPU_LAUNCH $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null
+	STARPU_NCUDA=3 STARPU_NCPUS=8 STARPU_CALIBRATE=1 STARPU_PREFETCH=1 STARPU_SCHED="dmda" $STARPU_LAUNCH $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null
 }
 
 mkdir -p $TIMINGDIR

+ 5 - 5
tests/heat/heat.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -47,7 +47,7 @@ do
 			export STARPU_NCUDA=$cublas
 
 			echo "size $size cpus $cpus cublas $cublas blocks $blocks" 
-			$ROOTDIR/examples/heat -nthick 34 -ntheta $(($theta+2)) -nblocks $BLOCKS 2>/dev/null| tee $filename
+			$STARPU_LAUNCH $ROOTDIR/examples/heat -nthick 34 -ntheta $(($theta+2)) -nblocks $BLOCKS 2>/dev/null| tee $filename
 		done
 	done
 
@@ -63,7 +63,7 @@ do
 			export STARPU_NCUDA=$cublas
 
 			echo "size $size cpus $cpus cublas $cublas blocks $blocks" 
-			$ROOTDIR/examples/heat -nthick 34 -ntheta $(($theta+2)) -nblocks $BLOCKS 2>/dev/null| tee $filename
+			$STARPU_LAUNCH $ROOTDIR/examples/heat -nthick 34 -ntheta $(($theta+2)) -nblocks $BLOCKS 2>/dev/null| tee $filename
 		done
 	done
 done
@@ -90,14 +90,14 @@ do
 
 		echo "size $size cpus 4 cublas 0 blocks $blocks"
 		filename=$TIMINGDIR/timing.4.0.$size.$blocks
-		$ROOTDIR/examples/heat -nthick 34 -ntheta $(($theta+2)) -nblocks $blocks 2>/dev/null| tee $filename
+		$STARPU_LAUNCH $ROOTDIR/examples/heat -nthick 34 -ntheta $(($theta+2)) -nblocks $blocks 2>/dev/null| tee $filename
 
 		export STARPU_NCPUS=3
 		export STARPU_NCUDA=1
 
 		echo "size $size cpus 3 cublas 1 blocks $blocks"
 		filename=$TIMINGDIR/timing.3.1.$size.$blocks
-		$ROOTDIR/examples/heat -nthick 34 -ntheta $(($theta+2)) -nblocks $blocks 2>/dev/null| tee $filename
+		$STARPU_LAUNCH $ROOTDIR/examples/heat -nthick 34 -ntheta $(($theta+2)) -nblocks $blocks 2>/dev/null| tee $filename
 	done
 done
 

+ 2 - 2
tests/heat/model_perturbation.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014, 2019                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2019-2020                 Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -57,7 +57,7 @@ trace_perturbation()
 		for iter in `seq 1 $nsamples`
 		do
 			echo "$iter / $nsamples"
-			 val=`$ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null`
+			 val=`$STARPU_LAUNCH $ROOTDIR/examples/heat/heat $OPTIONS 2> /dev/null`
 			 echo "$val" >> $filename
 		done
 	done

+ 3 - 3
tests/heat/sched.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014, 2019                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2019-2020                 Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -38,7 +38,7 @@ calibrate_point()
 		export STARPU_SCHED=$strat
 		export STARPU_CALIBRATE=1
 		export STARPU_PREFETCH=$prefetch
-		val=`$ROOTDIR/examples/heat/heat -pin -nblocks $nblocks -size $size -v3 2> /dev/null`
+		val=`$STARPU_LAUNCH $ROOTDIR/examples/heat/heat -pin -nblocks $nblocks -size $size -v3 2> /dev/null`
 		echo "$val"
 	done
 
@@ -68,7 +68,7 @@ trace_point()
 		export STARPU_SCHED=$strat
 		export STARPU_CALIBRATE=$docalibrate
 		export STARPU_PREFETCH=$prefetch
-		val=`$ROOTDIR/examples/heat/heat -pin -nblocks $nblocks -size $size -v3  2> /dev/null`
+		val=`$STARPU_LAUNCH $ROOTDIR/examples/heat/heat -pin -nblocks $nblocks -size $size -v3  2> /dev/null`
 		echo "$val"
 		echo "$val" >> $filename
 	done

+ 2 - 2
tests/heat/speedup.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -41,7 +41,7 @@ do
 		echo "ncpus $cpus size $size"
 
 		filename=$TIMINGDIR/timing.$cpus.$size
-		$ROOTDIR/examples/heat -v2 -pin -nthick 34 -ntheta $(($theta+2)) -nblocks 16 2>/dev/null| tee $filename
+		$STARPU_LAUNCH $ROOTDIR/examples/heat -v2 -pin -nthick 34 -ntheta $(($theta+2)) -nblocks 16 2>/dev/null| tee $filename
 
 		echo "$cpus	`cat $TIMINGDIR/timing.$cpus.$size`	`cat  $TIMINGDIR/timing.1.$size`" >> $TIMINGDIR/speedup.$size
 	done

+ 2 - 2
tests/incrementer/speed.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010,2011,2014                           Université de Bordeaux
+# Copyright (C) 2010,2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -27,7 +27,7 @@ max=20
 
 for logi in `seq 0 $max`
 do
-	$examplebindir/incrementer $i 2> .tmpperf
+	$STARPU_LAUNCH $examplebindir/incrementer $i 2> .tmpperf
 
 	grep "ms" .tmpperf
 	grep "ms" .tmpperf | sed -e "s/^\(.*\) elems took \(.*\) ms$/\1	\2/" >> .perftable 

+ 51 - 13
tests/loader.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011,2012,2017                           Inria
  * Copyright (C) 2011-2020                                CNRS
- * Copyright (C) 2010,2014-2018                           Université de Bordeaux
+ * Copyright (C) 2010,2014-2018,2020                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -219,6 +219,8 @@ int main(int argc, char *argv[])
 	char *test_args;
 	char *launcher;
 	char *launcher_args;
+	char *libtool;
+	const char *top_builddir = getenv ("top_builddir");
 	struct sigaction sa;
 	int   ret;
 	struct timeval start;
@@ -291,6 +293,54 @@ int main(int argc, char *argv[])
 	if (launcher_args)
 		launcher_args=strdup(launcher_args);
 
+	if (top_builddir == NULL)
+	{
+		fprintf(stderr,
+			"warning: $top_builddir undefined, "
+			"so $STARPU_CHECK_LAUNCHER ignored\n");
+		launcher = NULL;
+		launcher_args = NULL;
+		libtool = NULL;
+	}
+	else
+	{
+		libtool = malloc(strlen(top_builddir) + 1 + strlen("libtool") + 1);
+		strcpy(libtool, top_builddir);
+		strcat(libtool, "/libtool");
+	}
+
+	if (launcher)
+	{
+		const char *top_srcdir = getenv("top_srcdir");
+		decode(&launcher, "@top_srcdir@", top_srcdir);
+		decode(&launcher_args, "@top_srcdir@", top_srcdir);
+	}
+
+	size_t len = strlen(test_name);
+	if (launcher && len >= 3 &&
+	    test_name[len-3] == '.' &&
+	    test_name[len-2] == 's' &&
+	    test_name[len-1] == 'h')
+	{
+		/* This is a shell script, don't run the check on bash, but pass
+		 * the script the decoded variables */
+		setenv("STARPU_CHECK_LAUNCHER", launcher, 1);
+		if (launcher_args)
+			setenv("STARPU_CHECK_LAUNCHER_ARGS", launcher_args, 1);
+		else
+			launcher_args = "";
+
+		/* And give a convenience macro */
+		size_t len_launch = strlen(libtool) + 1 + strlen("--mode=execute") + 1
+				  + strlen(launcher) + 1 + strlen(launcher_args) + 1;
+		char *launch = malloc(len_launch);
+		snprintf(launch, len_launch, "%s --mode=execute %s %s", libtool, launcher, launcher_args);
+		setenv("STARPU_LAUNCH", launch, 1);
+
+		launcher = NULL;
+		launcher_args = NULL;
+	}
+
 	setenv("STARPU_OPENCL_PROGRAM_DIR", STARPU_SRC_DIR, 1);
 
 	/* set SIGALARM handler */
@@ -308,19 +358,10 @@ int main(int argc, char *argv[])
 			/* "Launchers" such as Valgrind need to be inserted
 			 * after the Libtool-generated wrapper scripts, hence
 			 * this special-case.  */
-			const char *top_builddir = getenv ("top_builddir");
-			const char *top_srcdir = getenv("top_srcdir");
 			if (top_builddir != NULL)
 			{
 				char *launcher_argv[100];
 				int i=3;
-				char libtool[strlen(top_builddir)
-					     + sizeof("libtool") + 1];
-				strcpy(libtool, top_builddir);
-				strcat(libtool, "/libtool");
-
-				decode(&launcher, "@top_srcdir@", top_srcdir);
-				decode(&launcher_args, "@top_srcdir@", top_srcdir);
 
 				launcher_argv[0] = libtool;
 				launcher_argv[1] = "--mode=execute";
@@ -341,9 +382,6 @@ int main(int argc, char *argv[])
 			}
 			else
 			{
-				fprintf(stderr,
-					"warning: $top_builddir undefined, "
-					"so $STARPU_CHECK_LAUNCHER ignored\n");
 				execl(test_name, test_name, test_args, NULL);
 			}
 		}

+ 2 - 2
tests/main/combined_workers/bfs/run.sh

@@ -3,7 +3,7 @@
 #
 # Copyright (C) 2012,2016,2017                           CNRS
 # Copyright (C) 2012                                     Inria
-# Copyright (C) 2014,2019                                Université de Bordeaux
+# Copyright (C) 2014,2019-2020                           Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -31,4 +31,4 @@ export STARPU_NCUDA=0
 export STARPU_NOPENCL=0
 export STARPU_WORKER_STATS=1
 export STARPU_CALIBRATE=1
-./bfs data/graph65536.txt
+$STARPU_LAUNCH ./bfs data/graph65536.txt

+ 2 - 2
tests/memory/memstress.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -41,7 +41,7 @@ trace_stress()
 		echo "Computing size $size with $memstress MB of memory LESS"
 		
 		echo "$ROOTDIR/examples/mult/dw_mult -x $size -y $size -z $size -nblocks $nblocks 2>/dev/null"
-		timing=`$ROOTDIR/examples/mult/dw_mult -x $size -y $size -z $size -nblocks $nblocks 2>/dev/null`
+		timing=`$STARPU_LAUNCH $ROOTDIR/examples/mult/dw_mult -x $size -y $size -z $size -nblocks $nblocks 2>/dev/null`
 	
 		echo "size : $size memstress $memstress => $timing us"
 

+ 2 - 2
tests/memory/memstress2.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -40,7 +40,7 @@ trace_stress()
 
 		
 		echo "$ROOTDIR/examples/mult/dw_mult -x $size -y $size -z $size -nblocks $nblocks 2>/dev/null"
-		timing=`$ROOTDIR/examples/mult/dw_mult -x $size -y $size -z $size -nblocks $nblocks 2>/dev/null`
+		timing=`$STARPU_LAUNCH $ROOTDIR/examples/mult/dw_mult -x $size -y $size -z $size -nblocks $nblocks 2>/dev/null`
 	
 		echo "size : $size memstress $stress => $timing us"
 

+ 19 - 0
tests/microbenchs/async_tasks_data_overhead.sh

@@ -0,0 +1,19 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020                                     Université de Bordeaux
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+# Wrapper: exec the program named like this script (minus ".sh", "data_" dropped) with "-b 1".
+ROOT=${0%.sh}                                    # path of this script without the ".sh" suffix
+ROOT=${ROOT/tasks_data_overhead/tasks_overhead}  # path of the program to run
+exec $STARPU_LAUNCH "$ROOT" -b 1 "$@"            # $STARPU_LAUNCH is unquoted on purpose: it may hold several words or be empty

+ 53 - 25
tests/microbenchs/async_tasks_overhead.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2014,2016                           Université de Bordeaux
+ * Copyright (C) 2009-2014,2016,2020                      Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,7 +24,17 @@
  * Measure the cost of submitting asynchronous tasks
  */
 
+starpu_data_handle_t data_handles[8];
+float *buffers[8];
+
+#ifdef STARPU_QUICK_CHECK
+static unsigned ntasks = 128;
+#else
 static unsigned ntasks = 65536;
+#endif
+static unsigned nbuffers = 0;
+
+#define BUFFERSIZE 16
 
 //static unsigned finished = 0;
 
@@ -45,36 +55,29 @@ static struct starpu_codelet dummy_codelet =
         .opencl_funcs = {dummy_func},
 	.cpu_funcs_name = {"dummy_func"},
 	.model = NULL,
-	.nbuffers = 0
+	.nbuffers = 0,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW}
 };
 
-//static void inject_one_task(void)
-//{
-//	struct starpu_task *task = starpu_task_create();
-//
-//	task->cl = &dummy_codelet;
-//	task->cl_arg = NULL;
-//	task->detach = 0;
-//
-//	int ret = starpu_task_submit(task);
-//	STARPU_ASSERT(!ret);
-//}
-
 static void usage(char **argv)
 {
-	fprintf(stderr, "%s [-i ntasks] [-p sched_policy] [-h]\n", argv[0]);
-	exit(-1);
+	fprintf(stderr, "Usage: %s [-i ntasks] [-p sched_policy] [-b nbuffers] [-h]\n", argv[0]);
+	exit(EXIT_FAILURE);
 }
 
 static void parse_args(int argc, char **argv, struct starpu_conf *conf)
 {
 	int c;
-	while ((c = getopt(argc, argv, "i:p:h")) != -1)
+	while ((c = getopt(argc, argv, "i:b:p:h")) != -1)
 	switch(c)
 	{
 		case 'i':
 			ntasks = atoi(optarg);
 			break;
+		case 'b':
+			nbuffers = atoi(optarg);
+			dummy_codelet.nbuffers = nbuffers;
+			break;
 		case 'p':
 			conf->sched_policy_name = optarg;
 			break;
@@ -96,19 +99,22 @@ int main(int argc, char **argv)
 	starpu_conf_init(&conf);
 	conf.ncpus = 2;
 
-#ifdef STARPU_QUICK_CHECK
-	ntasks = 128;
-#endif
-
 	parse_args(argc, argv, &conf);
 
 	ret = starpu_initialize(&conf, &argc, &argv);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	unsigned buffer;
+	for (buffer = 0; buffer < nbuffers; buffer++)
+	{
+		starpu_malloc((void**)&buffers[buffer], BUFFERSIZE*sizeof(float));
+		starpu_vector_data_register(&data_handles[buffer], STARPU_MAIN_RAM, (uintptr_t)buffers[buffer], BUFFERSIZE, sizeof(float));
+	}
+
 	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
 
-	fprintf(stderr, "#tasks : %u\n", ntasks);
+	fprintf(stderr, "#tasks : %u\n#buffers : %u\n", ntasks, nbuffers);
 
 	/* Create an array of tasks */
 	struct starpu_task **tasks = (struct starpu_task **) malloc(ntasks*sizeof(struct starpu_task *));
@@ -117,8 +123,14 @@ int main(int argc, char **argv)
 	{
 		struct starpu_task *task = starpu_task_create();
 		task->cl = &dummy_codelet;
-		task->cl_arg = NULL;
 		task->detach = 0;
+
+		/* we have 8 buffers at most */
+		for (buffer = 0; buffer < nbuffers; buffer++)
+		{
+			task->handles[buffer] = data_handles[buffer];
+		}
+
 		tasks[i] = task;
 	}
 
@@ -165,21 +177,37 @@ int main(int argc, char **argv)
 
                 if (output_dir && bench_id)
 		{
+                        char number[1+sizeof(nbuffers)*3+1];
+                        const char *numberp;
                         char file[1024];
                         FILE *f;
 
-                        snprintf(file, sizeof(file), "%s/async_tasks_overhead_total.dat", output_dir);
+                        if (nbuffers)
+                        {
+                                snprintf(number, sizeof(number), "_%u", nbuffers);
+                                numberp = number;
+                        }
+                        else
+                                numberp = "";
+
+                        snprintf(file, sizeof(file), "%s/async_tasks_overhead_total%s.dat", output_dir, numberp);
                         f = fopen(file, "a");
                         fprintf(f, "%s\t%f\n", bench_id, timing/1000000);
                         fclose(f);
 
-                        snprintf(file, sizeof(file), "%s/async_tasks_overhead_per_task.dat", output_dir);
+                        snprintf(file, sizeof(file), "%s/async_tasks_overhead_per_task%s.dat", output_dir, numberp);
                         f = fopen(file, "a");
                         fprintf(f, "%s\t%f\n", bench_id, timing/ntasks);
                         fclose(f);
                 }
         }
 
+	for (buffer = 0; buffer < nbuffers; buffer++)
+	{
+		starpu_data_unregister(data_handles[buffer]);
+		starpu_free((void*)buffers[buffer]);
+	}
+
 	starpu_shutdown();
 	free(tasks);
 

+ 2 - 2
tests/microbenchs/microbench.sh

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2016,2017                                CNRS
-# Copyright (C) 2016,2017,2019                           Université de Bordeaux
+# Copyright (C) 2016,2017,2019-2020                      Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -50,7 +50,7 @@ test_scheds()
 	for sched in $SCHEDS;
 	do
 	    	set +e
-		STARPU_SCHED=$sched $(dirname $0)/$TEST "$@"
+		STARPU_SCHED=$sched $STARPU_LAUNCH $(dirname $0)/$TEST "$@"
 		ret=$?
 	    	set -e
 		if test $ret = 0

+ 2 - 2
tests/microbenchs/starpu_check.sh

@@ -1,7 +1,7 @@
 #!/bin/sh
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011,2014                           Université de Bordeaux
+# Copyright (C) 2009-2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2011,2015,2017                      CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -40,7 +40,7 @@ test_with_timeout()
 
 	echo "$application"
 
-	$application > /dev/null 2> /dev/null & _pid_appli=$!;
+	$STARPU_LAUNCH $application > /dev/null 2> /dev/null & _pid_appli=$!;
 	(sleep $timeout ; kill -9 $_pid_appli 2> /dev/null) & _pid_killer=$!
 	wait $_pid_appli
 	ret=$?

+ 19 - 0
tests/microbenchs/sync_tasks_data_overhead.sh

@@ -0,0 +1,19 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020                                     Université de Bordeaux
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+# Wrapper: exec the program named like this script (minus ".sh", "data_" dropped) with "-b 1".
+ROOT=${0%.sh}                                    # path of this script without the ".sh" suffix
+ROOT=${ROOT/tasks_data_overhead/tasks_overhead}  # path of the program to run
+exec $STARPU_LAUNCH "$ROOT" -b 1 "$@"            # $STARPU_LAUNCH is unquoted on purpose: it may hold several words or be empty

+ 70 - 15
tests/microbenchs/sync_tasks_overhead.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2014,2016                           Université de Bordeaux
+ * Copyright (C) 2009-2014,2016,2020                      Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,7 +25,17 @@
  * Measure the cost of submitting synchronous tasks
  */
 
+starpu_data_handle_t data_handles[8];
+float *buffers[8];
+
+#ifdef STARPU_QUICK_CHECK
+static unsigned ntasks = 128;
+#else
 static unsigned ntasks = 65536;
+#endif
+static unsigned nbuffers = 0;
+
+#define BUFFERSIZE 16
 
 void dummy_func(void *descr[], void *arg)
 {
@@ -40,11 +50,11 @@ static struct starpu_codelet dummy_codelet =
         .opencl_funcs = {dummy_func},
 	.cpu_funcs_name = {"dummy_func"},
 	.model = NULL,
-	.nbuffers = 0
+	.nbuffers = 0,
+	.modes = {STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW}
 };
 
-static
-int inject_one_task(void)
+static int inject_one_task(void)
 {
 	int ret;
 	struct starpu_task *task = starpu_task_create();
@@ -59,15 +69,31 @@ int inject_one_task(void)
 
 }
 
-static void parse_args(int argc, char **argv)
+static void usage(char **argv)
+{
+	fprintf(stderr, "Usage: %s [-i ntasks] [-p sched_policy] [-b nbuffers] [-h]\n", argv[0]);
+	exit(EXIT_FAILURE);
+}
+
+static void parse_args(int argc, char **argv, struct starpu_conf *conf)
 {
 	int c;
-	while ((c = getopt(argc, argv, "i:")) != -1)
+	while ((c = getopt(argc, argv, "i:b:p:h")) != -1)
 	switch(c)
 	{
 		case 'i':
 			ntasks = atoi(optarg);
 			break;
+		case 'b':
+			nbuffers = atoi(optarg);
+			dummy_codelet.nbuffers = nbuffers;
+			break;
+		case 'p':
+			conf->sched_policy_name = optarg;
+			break;
+		case 'h':
+			usage(argv);
+			break;
 	}
 }
 
@@ -82,22 +108,35 @@ int main(int argc, char **argv)
 	starpu_conf_init(&conf);
 	conf.ncpus = 2;
 
-#ifdef STARPU_QUICK_CHECK
-	ntasks = 128;
-#endif
-
-	parse_args(argc, argv);
+	parse_args(argc, argv, &conf);
 
 	ret = starpu_initialize(&conf, &argc, &argv);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	fprintf(stderr, "#tasks : %u\n", ntasks);
+	unsigned buffer;
+	for (buffer = 0; buffer < nbuffers; buffer++)
+	{
+		starpu_malloc((void**)&buffers[buffer], BUFFERSIZE*sizeof(float));
+		starpu_vector_data_register(&data_handles[buffer], STARPU_MAIN_RAM, (uintptr_t)buffers[buffer], BUFFERSIZE, sizeof(float));
+	}
+
+	fprintf(stderr, "#tasks : %u\n#buffers : %u\n", ntasks, nbuffers);
 
 	start = starpu_timing_now();
 	for (i = 0; i < ntasks; i++)
 	{
-		ret = inject_one_task();
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &dummy_codelet;
+		task->synchronous = 1;
+
+		/* we have 8 buffers at most */
+		for (buffer = 0; buffer < nbuffers; buffer++)
+		{
+			task->handles[buffer] = data_handles[buffer];
+		}
+
+		ret = starpu_task_submit(task);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
@@ -114,21 +153,37 @@ int main(int argc, char **argv)
 
                 if (output_dir && bench_id)
 		{
+                        char number[1+sizeof(nbuffers)*3+1];
+                        const char *numberp;
                         char file[1024];
                         FILE *f;
 
-                        snprintf(file, sizeof(file), "%s/sync_tasks_overhead_total.dat", output_dir);
+                        if (nbuffers)
+                        {
+                                snprintf(number, sizeof(number), "_%u", nbuffers);
+                                numberp = number;
+                        }
+                        else
+                                numberp = "";
+
+                        snprintf(file, sizeof(file), "%s/sync_tasks_overhead_total%s.dat", output_dir, numberp);
                         f = fopen(file, "a");
                         fprintf(f, "%s\t%f\n", bench_id, timing/1000000);
                         fclose(f);
 
-                        snprintf(file, sizeof(file), "%s/sync_tasks_overhead_per_task.dat", output_dir);
+                        snprintf(file, sizeof(file), "%s/sync_tasks_overhead_per_task%s.dat", output_dir, numberp);
                         f = fopen(file, "a");
                         fprintf(f, "%s\t%f\n", bench_id, timing/ntasks);
                         fclose(f);
                 }
         }
 
+	for (buffer = 0; buffer < nbuffers; buffer++)
+	{
+		starpu_data_unregister(data_handles[buffer]);
+		starpu_free((void*)buffers[buffer]);
+	}
+
 	starpu_shutdown();
 
 	return EXIT_SUCCESS;

+ 19 - 0
tests/microbenchs/tasks_data_overhead.sh

@@ -0,0 +1,19 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020                                     Université de Bordeaux
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+# Wrapper: exec the program named like this script (minus ".sh", "data_" dropped) with "-b 1".
+ROOT=${0%.sh}                                    # path of this script without the ".sh" suffix
+ROOT=${ROOT/tasks_data_overhead/tasks_overhead}  # path of the program to run
+exec $STARPU_LAUNCH "$ROOT" -b 1 "$@"            # $STARPU_LAUNCH is unquoted on purpose: it may hold several words or be empty

+ 63 - 42
tests/microbenchs/tasks_overhead.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011,2013,2014,2016                 Université de Bordeaux
+ * Copyright (C) 2009-2011,2013,2014,2016,2020            Université de Bordeaux
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  *
@@ -36,6 +36,8 @@ static unsigned ntasks = 65536;
 #endif
 static unsigned nbuffers = 0;
 
+#define BUFFERSIZE 16
+
 struct starpu_task *tasks;
 
 void dummy_func(void *descr[], void *arg)
@@ -55,25 +57,16 @@ static struct starpu_codelet dummy_codelet =
 	.modes = {STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW, STARPU_RW}
 };
 
-static
-int inject_one_task(void)
+static void usage(char **argv)
 {
-	struct starpu_task *task = starpu_task_create();
-
-	task->cl = &dummy_codelet;
-	task->cl_arg = NULL;
-	task->callback_func = NULL;
-	task->synchronous = 1;
-
-	int ret;
-	ret = starpu_task_submit(task);
-	return ret;
+	fprintf(stderr, "Usage: %s [-i ntasks] [-p sched_policy] [-b nbuffers] [-h]\n", argv[0]);
+	exit(EXIT_FAILURE);
 }
 
-static void parse_args(int argc, char **argv)
+static void parse_args(int argc, char **argv, struct starpu_conf *conf)
 {
 	int c;
-	while ((c = getopt(argc, argv, "i:b:h")) != -1)
+	while ((c = getopt(argc, argv, "i:b:p:h")) != -1)
 	switch(c)
 	{
 		case 'i':
@@ -83,8 +76,11 @@ static void parse_args(int argc, char **argv)
 			nbuffers = atoi(optarg);
 			dummy_codelet.nbuffers = nbuffers;
 			break;
+		case 'p':
+			conf->sched_policy_name = optarg;
+			break;
 		case 'h':
-			fprintf(stderr, "Usage: %s [-i ntasks] [-b nbuffers] [-h]\n", argv[0]);
+			usage(argv);
 			break;
 	}
 }
@@ -105,7 +101,7 @@ int main(int argc, char **argv)
 	starpu_conf_init(&conf);
 	conf.ncpus = 2;
 
-	parse_args(argc, argv);
+	parse_args(argc, argv, &conf);
 
 	ret = starpu_initialize(&conf, &argc, &argv);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
@@ -114,8 +110,8 @@ int main(int argc, char **argv)
 	unsigned buffer;
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
-		starpu_malloc((void**)&buffers[buffer], 16*sizeof(float));
-		starpu_vector_data_register(&data_handles[buffer], STARPU_MAIN_RAM, (uintptr_t)buffers[buffer], 16, sizeof(float));
+		starpu_malloc((void**)&buffers[buffer], BUFFERSIZE*sizeof(float));
+		starpu_vector_data_register(&data_handles[buffer], STARPU_MAIN_RAM, (uintptr_t)buffers[buffer], BUFFERSIZE, sizeof(float));
 	}
 
 	fprintf(stderr, "#tasks : %u\n#buffers : %u\n", ntasks, nbuffers);
@@ -126,9 +122,7 @@ int main(int argc, char **argv)
 	for (i = 0; i < ntasks; i++)
 	{
 		starpu_task_init(&tasks[i]);
-		tasks[i].callback_func = NULL;
 		tasks[i].cl = &dummy_codelet;
-		tasks[i].cl_arg = NULL;
 		tasks[i].synchronous = 0;
 		tasks[i].use_tag = 1;
 		tasks[i].tag_id = (starpu_tag_t)i;
@@ -142,19 +136,33 @@ int main(int argc, char **argv)
 	tasks[ntasks-1].detach = 0;
 
 	start_submit = starpu_timing_now();
-	for (i = 1; i < ntasks; i++)
-	{
-		starpu_tag_declare_deps((starpu_tag_t)i, 1, (starpu_tag_t)(i-1));
-
-		ret = starpu_task_submit(&tasks[i]);
-		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	}
+        if (nbuffers)
+        {
+                /* Data dependency, just submit them all */
+                for (i = 0; i < ntasks; i++)
+                {
+                        ret = starpu_task_submit(&tasks[i]);
+                        if (ret == -ENODEV) goto enodev;
+                        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+                }
+        }
+        else
+        {
+                /* No data dependency, we have to introduce dependencies by hand */
+                for (i = 1; i < ntasks; i++)
+                {
+                        starpu_tag_declare_deps((starpu_tag_t)i, 1, (starpu_tag_t)(i-1));
+
+                        ret = starpu_task_submit(&tasks[i]);
+                        if (ret == -ENODEV) goto enodev;
+                        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+                }
 
-	/* submit the first task */
-	ret = starpu_task_submit(&tasks[0]);
-	if (ret == -ENODEV) goto enodev;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+                /* submit the first task */
+                ret = starpu_task_submit(&tasks[0]);
+                if (ret == -ENODEV) goto enodev;
+                STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+        }
 
 	end_submit = starpu_timing_now();
 
@@ -169,9 +177,6 @@ int main(int argc, char **argv)
 	for (i = 0; i < ntasks; i++)
 		starpu_task_clean(&tasks[i]);
 
-	for (buffer = 0; buffer < nbuffers; buffer++)
-		starpu_data_unregister(data_handles[buffer]);
-
 	timing_submit = end_submit - start_submit;
 	timing_exec = end_exec - start_exec;
 
@@ -190,41 +195,57 @@ int main(int argc, char **argv)
 
                 if (output_dir && bench_id)
 		{
+                        char number[1+sizeof(nbuffers)*3+1];
+                        const char *numberp;
                         char file[1024];
                         FILE *f;
 
-                        snprintf(file, sizeof(file), "%s/tasks_overhead_total_submit.dat", output_dir);
+                        if (nbuffers)
+                        {
+                                snprintf(number, sizeof(number), "_%u", nbuffers);
+                                numberp = number;
+                        }
+                        else
+                                numberp = "";
+
+                        snprintf(file, sizeof(file), "%s/tasks_overhead_total_submit%s.dat", output_dir, numberp);
                         f = fopen(file, "a");
                         fprintf(f, "%s\t%f\n", bench_id, timing_submit/1000000);
                         fclose(f);
 
-                        snprintf(file, sizeof(file), "%s/tasks_overhead_per_task_submit.dat", output_dir);
+                        snprintf(file, sizeof(file), "%s/tasks_overhead_per_task_submit%s.dat", output_dir, numberp);
                         f = fopen(file, "a");
                         fprintf(f, "%s\t%f\n", bench_id, timing_submit/ntasks);
                         fclose(f);
 
-                        snprintf(file, sizeof(file), "%s/tasks_overhead_total_execution.dat", output_dir);
+                        snprintf(file, sizeof(file), "%s/tasks_overhead_total_execution%s.dat", output_dir, numberp);
                         f = fopen(file, "a");
                         fprintf(f, "%s\t%f\n", bench_id, timing_exec/1000000);
                         fclose(f);
 
-                        snprintf(file, sizeof(file), "%s/tasks_overhead_per_task_execution.dat", output_dir);
+                        snprintf(file, sizeof(file), "%s/tasks_overhead_per_task_execution%s.dat", output_dir, numberp);
                         f = fopen(file, "a");
                         fprintf(f, "%s\t%f\n", bench_id, timing_exec/ntasks);
                         fclose(f);
 
-                        snprintf(file, sizeof(file), "%s/tasks_overhead_total_submit_execution.dat", output_dir);
+                        snprintf(file, sizeof(file), "%s/tasks_overhead_total_submit_execution%s.dat", output_dir, numberp);
                         f = fopen(file, "a");
                         fprintf(f, "%s\t%f\n", bench_id, (timing_submit+timing_exec)/1000000);
                         fclose(f);
 
-                        snprintf(file, sizeof(file), "%s/tasks_overhead_per_task_submit_execution.dat", output_dir);
+                        snprintf(file, sizeof(file), "%s/tasks_overhead_per_task_submit_execution%s.dat", output_dir, numberp);
                         f = fopen(file, "a");
                         fprintf(f, "%s\t%f\n", bench_id, (timing_submit+timing_exec)/ntasks);
                         fclose(f);
                 }
         }
 
+	for (buffer = 0; buffer < nbuffers; buffer++)
+	{
+		starpu_data_unregister(data_handles[buffer]);
+		starpu_free((void*)buffers[buffer]);
+	}
+
 	starpu_shutdown();
 	free(tasks);
 	return EXIT_SUCCESS;

+ 2 - 2
tests/microbenchs/tasks_size_overhead.sh

@@ -2,7 +2,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2010,2011,2013,2015,2017                 CNRS
-# Copyright (C) 2009,2010,2012,2014,2016                 Université de Bordeaux
+# Copyright (C) 2009,2010,2012,2014,2016,2020            Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,6 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 ROOT=${0%.sh}
-$ROOT "$@" > tasks_size_overhead.output
+$STARPU_LAUNCH $ROOT "$@" > tasks_size_overhead.output
 $ROOT.gp
 gv tasks_size_overhead.eps

+ 2 - 2
tests/microbenchs/tasks_size_overhead_sched.sh

@@ -2,7 +2,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2010,2011,2016,2017                      CNRS
-# Copyright (C) 2009,2010,2016                           Université de Bordeaux
+# Copyright (C) 2009,2010,2016,2020                      Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -18,7 +18,7 @@
 ROOT=${0%.sh}
 ROOT=${ROOT%_sched}
 unset STARPU_SSILENT
-$ROOT "$@" > tasks_size_overhead.output
+$STARPU_LAUNCH $_STARPU_LAUNCH $ROOT "$@" > tasks_size_overhead.output
 ret=$?
 if test "$ret" = "0"
 then

+ 3 - 1
tests/microbenchs/tasks_size_overhead_scheds.sh

@@ -2,7 +2,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2016,2017                                CNRS
-# Copyright (C) 2016,2019                                Université de Bordeaux
+# Copyright (C) 2016,2019-2020                           Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -24,4 +24,6 @@ then
 	FAST="-i 8"
 fi
 
+_STARPU_LAUNCH="$STARPU_LAUNCH"
+unset STARPU_LAUNCH
 test_scheds tasks_size_overhead_sched.sh $FAST

+ 2 - 2
tests/mult/gflops.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2008-2011,2014                           Université de Bordeaux
+# Copyright (C) 2008-2011,2014,2020                      Université de Bordeaux
 # Copyright (C) 2010,2015,2017                           CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -51,7 +51,7 @@ trace_size()
 		if [ $tile -lt $size -a $nblocks -lt 32 -a $(($size % $tile)) == 0 ];
 		then
 			echo "start tile $tile size $size nblocks $nblocks  "
-			timing=`$ROOTDIR/examples/mult/dw_mult -pin -x $size -y $size -z $size -nblocks $nblocks 2>/dev/null`
+			timing=`$STARPU_LAUNCH $ROOTDIR/examples/mult/dw_mult -pin -x $size -y $size -z $size -nblocks $nblocks 2>/dev/null`
 		else
 			timing="x"
 		fi

+ 0 - 0
tests/mult/sched.sh


Kaikkia tiedostoja ei voida näyttää, sillä liian monta tiedostoa muuttui tässä diffissä