Browse Source

Merge branch 'master' into fpga

Nathalie Furmento 6 years ago
parent
commit
7bad564662
100 changed files with 4744 additions and 870 deletions
  1. 1 0
      .gitignore
  2. 55 1
      ChangeLog
  3. 3 1
      Makefile.am
  4. 60 15
      configure.ac
  5. 3 0
      doc/Makefile.am
  6. 5 2
      doc/doxygen/Makefile.am
  7. 5 0
      doc/doxygen/chapters/000_introduction.doxy
  8. 17 14
      doc/doxygen/chapters/101_building.doxy
  9. 68 63
      doc/doxygen/chapters/210_check_list_performance.doxy
  10. 11 11
      doc/doxygen/chapters/301_tasks.doxy
  11. 7 2
      doc/doxygen/chapters/310_data_management.doxy
  12. 17 6
      doc/doxygen/chapters/320_scheduling.doxy
  13. 454 0
      doc/doxygen/chapters/370_online_performance_tools.doxy
  14. 1 1
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  15. 5 5
      doc/doxygen/chapters/470_simgrid.doxy
  16. 18 9
      doc/doxygen/chapters/501_environment_variables.doxy
  17. 5 5
      doc/doxygen/chapters/510_configure_options.doxy
  18. 7 0
      doc/doxygen/chapters/520_files.doxy
  19. 4 4
      doc/doxygen/chapters/api/threads.doxy
  20. 48 14
      doc/doxygen/dev/checkDoc.sh
  21. 0 28
      doc/doxygen/dev/sc_funcs.cocci
  22. 23 28
      doc/doxygen/dev/starpu_check_refs.sh
  23. 63 35
      doc/doxygen/dev/starpu_check_undocumented.sh
  24. 0 28
      doc/doxygen/dev/starpu_funcs.cocci
  25. 11 1
      doc/doxygen/doxygen-config.cfg.in
  26. 1 0
      doc/doxygen/doxygen.cfg
  27. 14 2
      doc/doxygen/refman.tex
  28. 5 0
      examples/Makefile.am
  29. 3 3
      examples/callback/prologue.c
  30. 4 3
      examples/cholesky/cholesky.sh
  31. 35 0
      examples/cpp/Makefile_add_vectors.mk
  32. 35 0
      examples/cpp/Makefile_add_vectors_cpp11.mk
  33. 672 0
      examples/cpp/add_vectors_interface.cpp
  34. 1 1
      examples/dependency/sequential_consistency.c
  35. 1 1
      examples/dependency/task_end_dep_add.c
  36. 131 0
      examples/perf_monitoring/perf_counters_01.c
  37. 241 0
      examples/perf_monitoring/perf_counters_02.c
  38. 131 0
      examples/perf_steering/perf_knobs_01.c
  39. 149 0
      examples/perf_steering/perf_knobs_02.c
  40. 180 0
      examples/perf_steering/perf_knobs_03.c
  41. 2 2
      examples/pipeline/pipeline.c
  42. 5 4
      examples/scheduler/dummy_modular_sched.c
  43. 5 4
      examples/scheduler/dummy_sched.c
  44. 6 1
      examples/scheduler/schedulers.sh
  45. 6 1
      examples/scheduler/schedulers_context.sh
  46. 2 1
      examples/stencil/Makefile.am
  47. 3 1
      include/fstarpu_mod.f90
  48. 2 0
      include/starpu.h
  49. 6 1
      include/starpu_config.h.in
  50. 13 0
      include/starpu_data_interfaces.h
  51. 203 0
      include/starpu_perf_monitoring.h
  52. 225 0
      include/starpu_perf_steering.h
  53. 76 2
      include/starpu_sched_component.h
  54. 50 17
      include/starpu_task.h
  55. 92 2
      include/starpu_task_util.h
  56. 51 3
      include/starpu_thread.h
  57. 99 7
      include/starpu_util.h
  58. 2 2
      include/starpu_worker.h
  59. 20 0
      mpi/examples/Makefile.am
  60. 2 1
      mpi/examples/complex/mpi_complex.c
  61. 0 5
      mpi/examples/filters/filter.c
  62. 11 4
      mpi/examples/mpi_lu/plu_implicit_example.c
  63. 10 9
      mpi/examples/native_fortran/nf_mm.f90
  64. 248 0
      mpi/examples/native_fortran/nf_mm_task_build.f90
  65. 4 1
      mpi/examples/user_datatype/my_interface.c
  66. 2 1
      mpi/examples/user_datatype/user_datatype.c
  67. 16 23
      mpi/include/fstarpu_mpi_mod.f90
  68. 7 2
      mpi/src/Makefile.am
  69. 2 2
      mpi/src/load_balancer/policy/load_heat_propagation.c
  70. 3 1
      mpi/src/mpi/starpu_mpi_comm.h
  71. 75 19
      mpi/src/mpi/starpu_mpi_early_data.c
  72. 11 1
      mpi/src/mpi/starpu_mpi_early_data.h
  73. 69 16
      mpi/src/mpi/starpu_mpi_early_request.c
  74. 10 1
      mpi/src/mpi/starpu_mpi_early_request.h
  75. 150 149
      mpi/src/mpi/starpu_mpi_mpi.c
  76. 117 0
      mpi/src/mpi/starpu_mpi_mpi_backend.c
  77. 80 0
      mpi/src/mpi/starpu_mpi_mpi_backend.h
  78. 6 6
      mpi/src/mpi/starpu_mpi_sync_data.c
  79. 3 3
      mpi/src/mpi/starpu_mpi_tag.c
  80. 51 51
      mpi/src/nmad/starpu_mpi_nmad.c
  81. 87 0
      mpi/src/nmad/starpu_mpi_nmad_backend.c
  82. 51 0
      mpi/src/nmad/starpu_mpi_nmad_backend.h
  83. 9 26
      mpi/src/starpu_mpi.c
  84. 2 2
      mpi/src/starpu_mpi_cache.c
  85. 5 5
      mpi/src/starpu_mpi_coop_sends.c
  86. 4 20
      mpi/src/starpu_mpi_init.c
  87. 48 84
      mpi/src/starpu_mpi_private.h
  88. 9 54
      mpi/src/starpu_mpi_req.c
  89. 43 2
      mpi/src/starpu_mpi_task_insert.c
  90. 72 24
      mpi/src/starpu_mpi_task_insert_fortran.c
  91. 3 1
      mpi/tests/Makefile.am
  92. 3 3
      mpi/tests/attr.c
  93. 8 3
      mpi/tests/block_interface.c
  94. 7 3
      mpi/tests/block_interface_pinned.c
  95. 5 5
      mpi/tests/callback.c
  96. 125 0
      mpi/tests/early_stuff.c
  97. 5 10
      socl/Makefile.am
  98. 7 1
      socl/src/Makefile.am
  99. 12 1
      src/Makefile.am
  100. 0 0
      src/common/fxt.c

+ 1 - 0
.gitignore

@@ -24,6 +24,7 @@
 .dirstamp
 .tramp_history
 *.pc
+*.vim
 stamp-h[0-9]*
 starpu.log
 /tests/datawizard/handle_to_pointer

+ 55 - 1
ChangeLog

@@ -20,8 +20,62 @@ StarPU 1.4.0 (svn revision xxxx)
 ==============================================
 New features:
   * Fault tolerance support with starpu_task_ft_failed().
+  * Add get_max_size method to data interfaces for applications using data with
+    variable size to express their maximal potential size.
 
-StarPU 1.3.2 (git revision xxx)
+Small changes:
+  * Use the S4U interface of Simgrid instead of xbt and MSG.
+
+StarPU 1.3.3 (git revision xxx)
+==============================================
+
+New features:
+  * New semantic for starpu_task_insert() and alike parameters
+    STARPU_CALLBACK_ARG, STARPU_PROLOGUE_CALLBACK_ARG, and
+    STARPU_PROLOGUE_CALLBACK_POP_ARG which set respectively
+    starpu_task::callback_arg_free,
+    starpu_task::prologue_callback_arg_free and
+    starpu_task::prologue_callback_pop_arg_free to 1 when used.
+    New parameters STARPU_CALLBACK_ARG_NFREE,
+    STARPU_CALLBACK_WITH_ARG_NFREE, STARPU_PROLOGUE_CALLBACK_ARG_NFREE, and
+    STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE which set the corresponding
+    fields of starpu_task to 0.
+  * starpufft: Support 3D.
+  * New modular-eager-prio scheduler.
+  * Add 'ready' heuristic to modular schedulers.
+  * New modular-heteroprio scheduler.
+  * Add STARPU_TASK_SCHED_DATA
+  * Add support for staging schedulers.
+  * New modular-heteroprio-heft scheduler.
+  * New dmdap "data-aware performance model (priority)" scheduler
+
+Changes:
+  * Modification in the Native Fortran interface of the functions
+    fstarpu_mpi_task_insert, fstarpu_mpi_task_build and
+    fstarpu_mpi_task_post_build to only take 1 parameter being the MPI
+    communicator, the codelet and the various parameters for the task.
+
+Small features:
+  * New starpu_task_insert() and alike parameter STARPU_TASK_WORKERIDS
+    allowing to set the fields starpu_task::workerids_len and
+    starpu_task::workerids
+  * New starpu_task_insert() and alike parameters
+    STARPU_SEQUENTIAL_CONSISTENCY, STARPU_TASK_NO_SUBMITORDER and
+    STARPU_TASK_PROFILING_INFO
+  * New function starpu_create_callback_task() which creates and
+    submits an empty task with the specified callback
+
+
+Small changes:
+   * Default modular worker queues to 2 tasks unless it's a heft
+     scheduler
+   * Separate out STATUS_SLEEPING_SCHEDULING state from
+     STATUS_SLEEPING state
+     When running the scheduler while being idle, workers do not go in
+     the STATUS_SCHEDULING state, so that that time is considered as
+     idle time instead of overhead.
+
+StarPU 1.3.2 (git revision af22a20fc00a37addf3cc6506305f89feed940b0)
 ==============================================
 
 Small changes:

+ 3 - 1
Makefile.am

@@ -109,7 +109,9 @@ versinclude_HEADERS = 				\
 	include/starpu_simgrid_wrap.h		\
 	include/starpu_mod.f90			\
 	include/fstarpu_mod.f90			\
-	include/starpu_clusters.h
+	include/starpu_clusters.h		\
+	include/starpu_perf_monitoring.h	\
+	include/starpu_perf_steering.h
 
 nodist_versinclude_HEADERS = 			\
 	include/starpu_config.h

+ 60 - 15
configure.ac

@@ -276,12 +276,19 @@ if test x$enable_simgrid = xyes ; then
 	AC_CHECK_HEADERS([simgrid/host.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_HOST_H], [1], [Define to 1 if you have host.h in simgrid/.])])
 	AC_CHECK_HEADERS([simgrid/simdag.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_SIMDAG_H], [1], [Define to 1 if you have simdag.h in simgrid/.])])
 	AC_CHECK_HEADERS([xbt/synchro.h], [AC_DEFINE([STARPU_HAVE_XBT_SYNCHRO_H], [1], [Define to 1 if you have synchro.h in xbt/.])])
+	AC_CHECK_HEADERS([simgrid/actor.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_ACTOR_H], [1], [Define to 1 if you have actor.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/semaphore.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_SEMAPHORE_H], [1], [Define to 1 if you have semaphore.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/mutex.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MUTEX_H], [1], [Define to 1 if you have mutex.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/cond.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_COND_H], [1], [Define to 1 if you have cond.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/barrier.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_BARRIER_H], [1], [Define to 1 if you have barrier.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/engine.h])
 	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
 
 	# Latest functions
 	AC_CHECK_FUNCS([MSG_process_attach MSG_zone_get_hosts MSG_process_self_name MSG_process_userdata_init])
 	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_speed simcall_process_create sg_config_continue_after_help])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
+	AC_CHECK_FUNCS([sg_actor_sleep_for sg_actor_self sg_actor_ref sg_host_get_properties sg_host_send_to sg_cfg_set_int sg_actor_self_execute simgrid_get_clock])
 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
 
 	# Oldies for compatibility with older simgrid
@@ -314,19 +321,23 @@ if test x$enable_simgrid = xyes ; then
 	if test x$enable_shared = xno ; then
 		# When linking statically, libtool does not realize we need libstdc++ for simgrid_cpp.cpp
 		SIMGRID_LIBS="$SIMGRID_LIBS -lstdc++"
-		LDFLAGS="$LDFLAGS -lstdc++"
+		LIBS="$LIBS -lstdc++"
 	fi
 
-	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
-			  #ifdef HAVE_SIMGRID_MSG_H
-			  #include <simgrid/msg.h>
-			  #include <simgrid/host.h>
-			  #else
-			  #include <msg/msg.h>
-			  #endif
-			  ]])],,
-			  CXXFLAGS="-std=c++11 $CXXFLAGS"
-			  NVCCFLAGS="-std=c++11 $NVCCFLAGS")
+	case \ $CXXFLAGS\  in 
+	*\ -std=*\ *) ;;
+	*) 
+		AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+				  #ifdef STARPU_HAVE_SIMGRID_MSG_H
+				  #include <simgrid/msg.h>
+				  #include <simgrid/host.h>
+				  #else
+				  #include <msg/msg.h>
+				  #endif
+				  ]])],,
+				  CXXFLAGS="-std=c++11 $CXXFLAGS")
+		;;
+	esac
 	AC_LANG_POP([C++])
 	AC_ARG_ENABLE(simgrid-mc, [AS_HELP_STRING([--enable-simgrid-mc],
 				[Enable using Model Checker of simgrid])],
@@ -334,6 +345,7 @@ if test x$enable_simgrid = xyes ; then
 	if test x$enable_simgrid_mc = xyes ; then
 		AC_DEFINE(STARPU_SIMGRID_MC, [1], [Define this to enable Model Checker in simgrid execution])
 		AC_PATH_PROG([SIMGRID_MC], [simgrid-mc], [no], [$simgrid_dir/bin:$PATH])
+		LDFLAGS="$LDFLAGS -Wl,-znorelro -Wl,-znoseparate-code"
 	fi
 fi
 AM_CONDITIONAL(STARPU_SIMGRID_MC, test x$enable_simgrid_mc = xyes)
@@ -697,8 +709,14 @@ if test x$enable_simgrid = xno ; then
 fi
 
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
-if test x$running_mpi_check = xyes -a x$enable_simgrid = xyes -a x$enable_shared = xyes ; then
-    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, use --disable-shared to fix this])
+if test x$running_mpi_check = xyes -a x$enable_simgrid = xyes ; then
+    if test x$enable_shared = xyes ; then
+	AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, use --disable-shared to fix this])
+    else
+	CFLAGS="$CFLAGS -fPIC"
+	CXXFLAGS="$CXXFLAGS -fPIC"
+	NVCCFLAGS="$NVCCFLAGS --compiler-options -fPIC"
+    fi
 fi
 if test x$use_mpi = xyes ; then
     AC_MSG_CHECKING(whether MPI tests should be run)
@@ -1416,6 +1434,25 @@ if test x$enable_cuda = xyes; then
 	STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcudart"
 	STARPU_CUFFT_LDFLAGS="-lcufft"
 
+	AC_LANG_PUSH([C++])
+	case \ $NVCCFLAGS\  in 
+	*\ -std=*\ *) ;;
+	*) 
+		SAVED_CXX="$CXX"
+		CXX="$NVCC"
+		AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+				  #ifdef STARPU_HAVE_SIMGRID_MSG_H
+				  #include <simgrid/msg.h>
+				  #include <simgrid/host.h>
+				  #else
+				  #include <msg/msg.h>
+				  #endif
+				  ]])],,
+				  NVCCFLAGS="-std=c++11 $NVCCFLAGS")
+		CXX="$SAVED_CXX"
+	esac
+	AC_LANG_POP([C++])
+
         if test "$F77" = "gfortran" -o "$FC" = "gfortran" ; then
             STARPU_CUDA_FORTRAN_LDFLAGS="-lgfortran"
             AC_SUBST(STARPU_CUDA_FORTRAN_LDFLAGS)
@@ -1458,7 +1495,7 @@ if test x$enable_cuda = xyes; then
 		])
 	    ],
 	    [
-	    AC_MSG_ERROR([NVML found, but nvml.h could not be compiled])
+	    AC_MSG_WARN([NVML found, but nvml.h could not be compiled])
 	    have_valid_nvml="no"
 	    ]
 	)
@@ -3072,9 +3109,15 @@ AC_SUBST(BLAS_LIB,$blas_lib)
 #			 Multiple linear regression			      #
 #                                                                             #
 ###############################################################################
+if test x$enable_simgrid = xyes ; then
+	# There is no need for building mlr models in simgrid mode
+	default_enable_mlr=no
+else
+	default_enable_mlr=yes
+fi
 AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
 			[Disable multiple linear regression models])],
-			enable_mlr=$enableval, enable_mlr=yes)
+			enable_mlr=$enableval, enable_mlr=$default_enable_mlr)
 
 AC_MSG_CHECKING(whether multiple linear regression models are disabled)
 if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
@@ -3542,6 +3585,8 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_heterogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_heterogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
+  test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
+  test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
   mkdir -p tests/datawizard
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
   mkdir -p tests/overlap

+ 3 - 0
doc/Makefile.am

@@ -33,6 +33,9 @@ EXTRA_DIST =    tutorial/hello_world.c \
 txtdir = ${docdir}/tutorial
 txt_DATA = $(EXTRA_DIST)
 
+recheck:
+	-cat /dev/null
+
 showcheck:
 	-cat /dev/null
 

+ 5 - 2
doc/doxygen/Makefile.am

@@ -155,7 +155,7 @@ chapters/version.sty: $(chapters)
                 if test -f $(top_srcdir)/doc/doxygen/$$f ; then $(PROG_STAT) --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
         done | sort -r | head -1 > timestamp_sty
 	@if test -s timestamp_sty ; then \
-		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_sty` +"%d %B %Y" > timestamp_sty_updated ;\
+		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_sty` +"%F" > timestamp_sty_updated ;\
 		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_sty` +"%B %Y" > timestamp_sty_updated_month ;\
 	fi
 	@if test -s timestamp_sty_updated ; then \
@@ -174,7 +174,7 @@ chapters/version.html: $(chapters)
                 if test -f $(top_srcdir)/doc/doxygen/$$f ; then $(PROG_STAT) --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
         done | sort -r | head -1 > timestamp_html
 	@if test -s timestamp_html ; then \
-		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_html` +"%d %B %Y" > timestamp_html_updated ;\
+		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_html` +"%F" > timestamp_html_updated ;\
 		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_html` +"%B %Y" > timestamp_html_updated_month ;\
 	fi
 	@echo "This manual documents the usage of StarPU version $(VERSION)." > $(top_srcdir)/doc/doxygen/chapters/version.html
@@ -198,6 +198,7 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_bound.h		\
 	$(top_srcdir)/include/starpu_clusters.h		\
 	$(top_srcdir)/include/starpu_cublas.h		\
+	$(top_srcdir)/include/starpu_cublas_v2.h	\
 	$(top_srcdir)/include/starpu_cusparse.h		\
 	$(top_srcdir)/include/starpu_cuda.h		\
 	$(top_srcdir)/include/starpu_data_filters.h	\
@@ -213,6 +214,8 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_mod.f90		\
 	$(top_srcdir)/include/starpu_opencl.h		\
 	$(top_srcdir)/include/starpu_openmp.h		\
+	$(top_srcdir)/include/starpu_perf_monitoring.h	\
+	$(top_srcdir)/include/starpu_perf_steering.h	\
 	$(top_srcdir)/include/starpu_perfmodel.h	\
 	$(top_srcdir)/include/starpu_profiling.h	\
 	$(top_srcdir)/include/starpu_rand.h		\

+ 5 - 0
doc/doxygen/chapters/000_introduction.doxy

@@ -78,6 +78,11 @@ policies in a portable fashion (\ref HowToDefineANewSchedulingPolicy).
 
 The remainder of this section describes the main concepts used in StarPU.
 
+A video is available on the StarPU website
+http://starpu.gforge.inria.fr/ that presents these concepts in 26 minutes.
+
+Some tutorials are also available on http://starpu.gforge.inria.fr/tutorials/
+
 // explain the notion of codelet and task (i.e. g(A, B)
 
 \subsection CodeletAndTasks Codelet and Tasks

+ 17 - 14
doc/doxygen/chapters/101_building.doxy

@@ -55,7 +55,7 @@ location.
 
 If <c>libhwloc</c> is not available on your system, the option
 \ref without-hwloc "--without-hwloc" should be explicitly given when calling the
-<c>configure</c> script.
+script <c>configure</c>.
 
 
 \subsection GettingSources Getting Sources
@@ -88,8 +88,8 @@ $ git clone https://scm.gforge.inria.fr/anonscm/git/starpu/starpu.git
 
 Running <c>autogen.sh</c> is not necessary when using the tarball
 releases of StarPU.  However when using the source code from the git
-repository, you first need to generate the configure scripts and the
-Makefiles. This requires the availability of <c>autoconf</c> and
+repository, you first need to generate the script <c>configure</c> and the
+different Makefiles. This requires the availability of <c>autoconf</c> and
 <c>automake</c> >= 2.60.
 
 \verbatim
@@ -113,7 +113,7 @@ is advised to put them all in a separate directory. It is then
 easier to cleanup, and this allows to compile several configurations
 out of the same source tree. To do so, simply enter the directory
 where you want the compilation to produce its files, and invoke the
-<c>configure</c> script located in the StarPU source directory.
+script <c>configure</c> located in the StarPU source directory.
 
 \verbatim
 $ mkdir build
@@ -139,7 +139,7 @@ $ make
 Once everything is built, you may want to test the result. An
 extensive set of regression tests is provided with StarPU. Running the
 tests is done by calling <c>make check</c>. These tests are run every night
-and the result from the main profile is publicly available (http://starpu.gforge.inria.fr/testing/).
+and the result from the main profile is publicly available (http://starpu.gforge.inria.fr/testing/master/).
 
 \verbatim
 $ make check
@@ -246,7 +246,7 @@ int main(void)
     {
         return 1;
     }
-    printf("%d CPU coress\n", starpu_worker_get_count_by_type(STARPU_CPU_WORKER));
+    printf("%d CPU cores\n", starpu_worker_get_count_by_type(STARPU_CPU_WORKER));
     printf("%d CUDA GPUs\n", starpu_worker_get_count_by_type(STARPU_CUDA_WORKER));
     printf("%d OpenCL GPUs\n", starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER));
     starpu_shutdown();
@@ -273,7 +273,7 @@ int main(void)
     {
         return 1;
     }
-    printf("%d CPU coress\n", starpu_worker_get_count_by_type(STARPU_CPU_WORKER));
+    printf("%d CPU cores\n", starpu_worker_get_count_by_type(STARPU_CPU_WORKER));
     printf("%d CUDA GPUs\n", starpu_worker_get_count_by_type(STARPU_CUDA_WORKER));
     printf("%d OpenCL GPUs\n", starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER));
     starpu_shutdown();
@@ -428,12 +428,13 @@ While StarPU tasks are executing, the application is not supposed to do
 computations in the threads it starts itself, tasks should be used instead.
 
 If the application needs to reserve some cores for its own computations, it
-can do so with the starpu_conf::reserve_ncpus field, get the core IDs with
+can do so with the field starpu_conf::reserve_ncpus, get the core IDs with
 starpu_get_next_bindid(), and bind to them with starpu_bind_thread_on().
 
-Another option is for the application to put the whole StarPU on pause with
-starpu_pause() before performing its own computations, and let StarPU resume
-executing tasks with starpu_resume().
+Another option is for the application to pause StarPU by calling
+starpu_pause(), then to perform its own computations, and then to
+resume StarPU by calling starpu_resume() so that StarPU can execute
+tasks.
 
 \subsection EnablingOpenCL Enabling OpenCL
 
@@ -499,12 +500,12 @@ multiplication using BLAS and cuBLAS. They output the obtained GFlops.
 
 <c>lu_*</c> perform an LU factorization. They use different dependency primitives.
 
-\subsection SimulatedBenchmarks Simulated benchmarks
+\subsection SimulatedBenchmarks Simulated Benchmarks
 
 It can also be convenient to try simulated benchmarks, if you want to give a try
 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
-using the simgrid version of StarPU: first install the simgrid simulator from
-http://simgrid.gforge.inria.fr/ (we tested with simgrid from 3.11 to 3.16, and
+using the SimGrid version of StarPU: first install the SimGrid simulator from
+http://simgrid.gforge.inria.fr/ (we tested with SimGrid from 3.11 to 3.16, and
 3.18 to 3.22, other versions may have compatibility issues, 3.17 notably does
 not build at all. MPI simulation does not work with version 3.22),
 then configure StarPU with \ref enable-simgrid
@@ -527,4 +528,6 @@ Performance models are available for <c>cholesky_*</c>, <c>lu_*</c>, <c>*gemm</c
 320, 640, or 960 (plus 1440 for sirocco), and for <c>stencil</c> with block size 128x128x128, 192x192x192, and
 256x256x256.
 
+Read the chapter \ref SimGridSupport for more information on the SimGrid support.
+
 */

+ 68 - 63
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -28,8 +28,9 @@ will show roughly where time is spent, and focus correspondingly.
 
 \section CheckTaskSize Check Task Size
 
-Make sure that your tasks are not too small, because the StarPU runtime overhead
-is not completely zero. You can run the tasks_size_overhead.sh script to get an
+Make sure that your tasks are not too small, as the StarPU runtime overhead
+is not completely zero. As explained in \ref TaskSizeOverhead, you can
+run the script \c tasks_size_overhead.sh to get an
 idea of the scalability of tasks depending on their duration (in µs), on your
 own system.
 
@@ -40,19 +41,18 @@ much bigger than this.
 of cores, so it's better to try to get 10ms-ish tasks.
 
 Tasks durations can easily be observed when performance models are defined (see
-\ref PerformanceModelExample) by using the <c>starpu_perfmodel_plot</c> or
-<c>starpu_perfmodel_display</c> tool (see \ref PerformanceOfCodelets)
+\ref PerformanceModelExample) by using the tools <c>starpu_perfmodel_plot</c> or
+<c>starpu_perfmodel_display</c> (see \ref PerformanceOfCodelets)
 
 When using parallel tasks, the problem is even worse since StarPU has to
-synchronize the execution of tasks.
+synchronize the tasks execution.
 
 \section ConfigurationImprovePerformance Configuration Which May Improve Performance
 
-The \ref enable-fast "--enable-fast" \c configure option disables all
+The \c configure option \ref enable-fast "--enable-fast" disables all
 assertions. This makes StarPU more performant for really small tasks by
 disabling all sanity checks. Only use this for measurements and production, not for development, since this will drop all basic checks.
 
-
 \section DataRelatedFeaturesToImprovePerformance Data Related Features Which May Improve Performance
 
 link to \ref DataManagement
@@ -81,14 +81,14 @@ link to \ref StaticScheduling
 
 For proper overlapping of asynchronous GPU data transfers, data has to be pinned
 by CUDA. Data allocated with starpu_malloc() is always properly pinned. If the
-application is registering to StarPU some data which has not been allocated with
-starpu_malloc(), it should use starpu_memory_pin() to pin it.
+application registers to StarPU some data which has not been allocated with
+starpu_malloc(), starpu_memory_pin() should be called to pin the data memory.
 
 Due to CUDA limitations, StarPU will have a hard time overlapping its own
 communications and the codelet computations if the application does not use a
 dedicated CUDA stream for its computations instead of the default stream,
-which synchronizes all operations of the GPU. StarPU provides one by the use
-of starpu_cuda_get_local_stream() which can be used by all CUDA codelet
+which synchronizes all operations of the GPU. The function
+starpu_cuda_get_local_stream() returns a stream which can be used by all CUDA codelet
 operations to avoid this issue. For instance:
 
 \code{.c}
@@ -105,11 +105,11 @@ If some CUDA calls are made without specifying this local stream,
 synchronization needs to be made explicit with cudaThreadSynchronize() around these
 calls, to make sure that they get properly synchronized with the calls using
 the local stream. Notably, \c cudaMemcpy() and \c cudaMemset() are actually
-asynchronous and need such explicit synchronization! Use cudaMemcpyAsync() and
-cudaMemsetAsync() instead.
+asynchronous and need such explicit synchronization! Use \c cudaMemcpyAsync() and
+\c cudaMemsetAsync() instead.
 
-Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
-CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
+Calling starpu_cublas_init() ensures that StarPU properly calls the
+CUBLAS library functions. Some libraries like Magma may however change the current stream of CUBLAS v1,
 one then has to call <c>cublasSetKernelStream(</c>starpu_cuda_get_local_stream()<c>)</c> at
 the beginning of the codelet to make sure that CUBLAS is really using the proper
 stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS
@@ -147,14 +147,14 @@ triggered by the completion of the kernel.
 Using the flag ::STARPU_CUDA_ASYNC also permits to enable concurrent kernel
 execution, on cards which support it (Kepler and later, notably). This is
 enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the
-number of kernels to execute concurrently.  This is useful when kernels are
+number of kernels to be executed concurrently.  This is useful when kernels are
 small and do not feed the whole GPU with threads to run.
 
-Concerning memory allocation, you should really not use \c cudaMalloc/ \c cudaFree
-within the kernel, since \c cudaFree introduces a awfully lot of synchronizations
+Concerning memory allocation, you should really not use \c cudaMalloc()/ \c cudaFree()
+within the kernel, since \c cudaFree() introduces an awful lot of synchronizations
 within CUDA itself. You should instead add a parameter to the codelet with the
 ::STARPU_SCRATCH mode access. You can then pass to the task a handle registered
-with the desired size but with the \c NULL pointer, that handle can even be the
+with the desired size but with the \c NULL pointer, the handle can even be
 shared between tasks, StarPU will allocate per-task data on the fly before task
 execution, and reuse the allocated data between tasks.
 
@@ -177,8 +177,8 @@ kernel startup and completion.
 
 It may happen that for some reason, StarPU does not make progress for a long
 period of time.  This is sometimes due to contention inside StarPU, but
-sometimes this is due to external reasons, such as stuck MPI driver, or CUDA
-driver, etc.
+sometimes this is due to external reasons, such as a stuck MPI or CUDA
+driver.
 
 <c>export STARPU_WATCHDOG_TIMEOUT=10000</c> (\ref STARPU_WATCHDOG_TIMEOUT)
 
@@ -187,30 +187,34 @@ any task for 10ms, but lets the application continue normally. In addition to th
 
 <c>export STARPU_WATCHDOG_CRASH=1</c> (\ref STARPU_WATCHDOG_CRASH)
 
-raises <c>SIGABRT</c> in this condition, thus allowing to catch the situation in gdb.
+raises <c>SIGABRT</c> in this condition, thus allowing to catch the
+situation in \c gdb.
+
 It can also be useful to type <c>handle SIGABRT nopass</c> in <c>gdb</c> to be able to let
 the process continue, after inspecting the state of the process.
 
 \section HowToLimitMemoryPerNode How to Limit Memory Used By StarPU And Cache Buffer Allocations
 
 By default, StarPU makes sure to use at most 90% of the memory of GPU devices,
-moving data in and out of the device as appropriate and with prefetch and
-writeback optimizations. Concerning the main memory, by default it will not
-limit its consumption, since by default it has nowhere to push the data to when
-memory gets tight. This also means that by default StarPU will not cache buffer
-allocations in main memory, since it does not know how much of the system memory
-it can afford.
-
-In the case of GPUs, the \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM,
-\ref STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM environment variables
-can be used to control how
-much (in MiB) of the GPU device memory should be used at most by StarPU (their
-default values are 90% of the available memory).
-
-In the case of the main memory, the \ref STARPU_LIMIT_CPU_MEM environment
-variable can be used to specify how much (in MiB) of the main memory should be
-used at most by StarPU for buffer allocations. This way, StarPU will be able to
-cache buffer allocations (which can be a real benefit if a lot of bufferes are
+moving data in and out of the device as appropriate, as well as using
+prefetch and writeback optimizations.
+
+The environment variables \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM,
+\ref STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM
+can be used to control how much (in MiB) of the GPU device memory
+should be used at most by StarPU (the default value is to use 90% of the
+available memory).
+
+By default, the usage of the main memory is not limited, as the
+default mechanisms do not provide means to evict main memory when it
+gets too tight. This also means that by default StarPU will not cache buffer
+allocations in main memory, since it does not know how much of the
+system memory it can afford.
+
+The environment variable \ref STARPU_LIMIT_CPU_MEM can be used to
+specify how much (in MiB) of the main memory should be used at most by
+StarPU for buffer allocations. This way, StarPU will be able to
+cache buffer allocations (which can be a real benefit if a lot of buffers are
 involved, or if allocation fragmentation can become a problem), and when using
 \ref OutOfCore, StarPU will know when it should evict data out to the disk.
 
@@ -233,8 +237,8 @@ caches or data out to the disk, starpu_memory_allocate() can be used to
 specify an amount of memory to be accounted for. starpu_memory_deallocate()
 can be used to account freed memory back. Those can for instance be used by data
 interfaces with dynamic data buffers: instead of using starpu_malloc_on_node(),
-they would dynamically allocate data with malloc/realloc, and notify starpu of
-the delta thanks to starpu_memory_allocate() and starpu_memory_deallocate() calls.
+they would dynamically allocate data with \c malloc()/\c realloc(), and notify StarPU of
+the delta by calling starpu_memory_allocate() and starpu_memory_deallocate().
 
 starpu_memory_get_total() and starpu_memory_get_available()
 can be used to get an estimation of how much memory is available.
@@ -251,7 +255,7 @@ to reserve this amount immediately.
 
 It is possible to reduce the memory footprint of the task and data internal
 structures of StarPU by describing the shape of your machine and/or your
-application at the \c configure step.
+application when calling \c configure.
 
 To reduce the memory footprint of the data internal structures of StarPU, one
 can set the
@@ -271,28 +275,27 @@ execution. For example, in the Cholesky factorization (dense linear algebra
 application), the GEMM task uses up to 3 buffers, so it is possible to set the
 maximum number of task buffers to 3 to run a Cholesky factorization on StarPU.
 
-The size of the various structures of StarPU can be printed by 
+The size of the various structures of StarPU can be printed by
 <c>tests/microbenchs/display_structures_size</c>.
 
-It is also often useless to submit *all* the tasks at the same time. One can
-make the starpu_task_submit() function block when a reasonable given number of
-tasks have been submitted, by setting the \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS and
-\ref STARPU_LIMIT_MAX_SUBMITTED_TASKS environment variables, for instance:
+It is also often useless to submit *all* the tasks at the same time.
+Task submission can be blocked when a reasonable given number of
+tasks have been submitted, by setting the environment variables \ref
+STARPU_LIMIT_MIN_SUBMITTED_TASKS and \ref STARPU_LIMIT_MAX_SUBMITTED_TASKS.
 
 <c>
 export STARPU_LIMIT_MAX_SUBMITTED_TASKS=10000
-
 export STARPU_LIMIT_MIN_SUBMITTED_TASKS=9000
 </c>
 
-To make StarPU block submission when 10000 tasks are submitted, and unblock
+will make StarPU block submission when 10000 tasks are submitted, and unblock
 submission when only 9000 tasks are still submitted, i.e. 1000 tasks have
 completed among the 10000 which were submitted when submission was blocked. Of
 course this may reduce parallelism if the threshold is set too low. The precise
 balance depends on the application task graph.
 
 An idea of how much memory is used for tasks and data handles can be obtained by
-setting the \ref STARPU_MAX_MEMORY_USE environment variable to <c>1</c>.
+setting the environment variable \ref STARPU_MAX_MEMORY_USE to <c>1</c>.
 
 \section HowtoReuseMemory How To Reuse Memory
 
@@ -303,7 +306,7 @@ tasks. For this system to work with MPI tasks, you need to submit tasks progress
 of as soon as possible, because in the case of MPI receives, the allocation cache check for reusing data
 buffers will be done at submission time, not at execution time.
 
-You have two options to control the task submission flow. The first one is by
+There are two options to control the task submission flow. The first one is by
 controlling the number of submitted tasks during the whole execution. This can
 be done whether by setting the environment variables
 \ref STARPU_LIMIT_MAX_SUBMITTED_TASKS and \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS to
@@ -348,11 +351,12 @@ To force continuing calibration,
 use <c>export STARPU_CALIBRATE=1</c> (\ref STARPU_CALIBRATE). This may be necessary if your application
 has not-so-stable performance. StarPU will force calibration (and thus ignore
 the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
-made on each architecture, to avoid badly scheduling tasks just because the
+made on each architecture, to avoid bad scheduling decisions just because the
 first measurements were not so good. Details on the current performance model status
-can be obtained from the tool <c>starpu_perfmodel_display</c>: the <c>-l</c>
-option lists the available performance models, and the <c>-s</c> option permits
-to choose the performance model to be displayed. The result looks like:
+can be obtained with the tool <c>starpu_perfmodel_display</c>: the
+option <c>-l</c> lists the available performance models, and the
+option <c>-s</c> allows choosing the performance model to be
+displayed. The result looks like:
 
 \verbatim
 $ starpu_perfmodel_display -s starpu_slu_lu_model_11
@@ -364,7 +368,7 @@ e5a07e31  4096     0.000000e+00  1.717457e+01  5.190038e+00  14
 ...
 \endverbatim
 
-Which shows that for the LU 11 kernel with a 1MiB matrix, the average
+which shows that for the LU 11 kernel with a 1MiB matrix, the average
 execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
 8 samples. It is a good idea to check this before doing actual performance
 measurements.
@@ -373,7 +377,7 @@ A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:
 
 \verbatim
 $ starpu_perfmodel_plot -s starpu_slu_lu_model_11
-4096 16384 65536 262144 1048576 4194304 
+4096 16384 65536 262144 1048576 4194304
 $ gnuplot starpu_starpu_slu_lu_model_11.gp
 $ gv starpu_starpu_slu_lu_model_11.eps
 \endverbatim
@@ -451,28 +455,29 @@ STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
 \section OverheadProfiling Overhead Profiling
 
 \ref OfflinePerformanceTools can already provide an idea of to what extent and
-which part of StarPU bring overhead on the execution time. To get a more precise
-analysis of the parts of StarPU which bring most overhead, <c>gprof</c> can be used.
+which part of StarPU brings an overhead on the execution time. To get a more precise
+analysis of which parts of StarPU bring the most overhead, <c>gprof</c> can be used.
 
 First, recompile and reinstall StarPU with <c>gprof</c> support:
 
 \code
-./configure --enable-perf-debug --disable-shared --disable-build-tests --disable-build-examples
+../configure --enable-perf-debug --disable-shared --disable-build-tests --disable-build-examples
 \endcode
 
 Make sure not to leave a dynamic version of StarPU in the target path: remove
 any remaining <c>libstarpu-*.so</c>
 
 Then relink your application with the static StarPU library, make sure that
-running <c>ldd</c> on your application does not mention any libstarpu
+running <c>ldd</c> on your application does not mention any \c libstarpu
 (i.e. it's really statically-linked).
 
 \code
 gcc test.c -o test $(pkg-config --cflags starpu-1.3) $(pkg-config --libs starpu-1.3)
 \endcode
 
-Now you can run your application, and a <c>gmon.out</c> file should appear in the
-current directory, you can process it by running <c>gprof</c> on your application:
+Now you can run your application; this will create a file
+<c>gmon.out</c> in the current directory, which can be processed by
+running <c>gprof</c> on your application:
 
 \code
 gprof ./test

+ 11 - 11
doc/doxygen/chapters/301_tasks.doxy

@@ -40,7 +40,7 @@ impact that has on the target machine.
 \section TaskSubmission Task Submission
 
 To let StarPU make online optimizations, tasks should be submitted
-asynchronously as much as possible. Ideally, all the tasks should be
+asynchronously as much as possible. Ideally, all tasks should be
 submitted, and mere calls to starpu_task_wait_for_all() or
 starpu_data_unregister() be done to wait for
 termination. StarPU will then be able to rework the whole schedule, overlap
@@ -52,7 +52,7 @@ By default, StarPU will consider the tasks in the order they are submitted by
 the application. If the application programmer knows that some tasks should
 be performed in priority (for instance because their output is needed by many
 other tasks and may thus be a bottleneck if not executed early
-enough), the field starpu_task::priority should be set to transmit the
+enough), the field starpu_task::priority should be set to provide the
 priority information to StarPU.
 
 \section TaskDependencies Task Dependencies
@@ -165,14 +165,14 @@ starpu_task_insert(&dummy_big_cl,
 \endcode
 
 The whole code for this complex data interface is available in the
-directory <c>examples/basic_examples/dynamic_handles.c</c>.
+file <c>examples/basic_examples/dynamic_handles.c</c>.
 
 \section SettingVariableDataHandlesForATask Setting a Variable Number Of Data Handles For a Task
 
-Normally, the number of data handles given to a task is fixed in the
-starpu_codelet::nbuffers codelet field. This field can however be set to
-\ref STARPU_VARIABLE_NBUFFERS, in which case the starpu_task::nbuffers task field
-must be set, and the starpu_task::modes field (or starpu_task::dyn_modes field,
+Normally, the number of data handles given to a task is set with
+starpu_codelet::nbuffers. This field can however be set to
+\ref STARPU_VARIABLE_NBUFFERS, in which case starpu_task::nbuffers
+must be set, and starpu_task::modes (or starpu_task::dyn_modes,
 see \ref SettingManyDataHandlesForATask) should be used to specify the modes for
 the handles.
 
@@ -215,7 +215,7 @@ struct starpu_codelet cl =
 
 Schedulers which are multi-implementation aware (only <c>dmda</c> and
 <c>pheft</c> for now) will use the performance models of all the
-implementations it was given, and pick the one which seems to be the fastest.
+provided implementations, and pick the one which seems to be the fastest.
 
 \section EnablingImplementationAccordingToCapabilities Enabling Implementation According To Capabilities
 
@@ -333,7 +333,7 @@ struct starpu_codelet cl =
 };
 \endcode
 
-Note: the most generic variant should be provided first, as some schedulers are
+Note that the most generic variant should be provided first, as some schedulers are
 not able to try the different variants.
 
 \section InsertTaskUtility Insert Task Utility
@@ -341,7 +341,7 @@ not able to try the different variants.
 StarPU provides the wrapper function starpu_task_insert() to ease
 the creation and submission of tasks.
 
-Here the implementation of the codelet:
+Here is the implementation of a codelet:
 
 \code{.c}
 void func_cpu(void *descr[], void *_args)
@@ -477,7 +477,7 @@ ret = starpu_task_get_task_succs(task, sizeof(tasks)/sizeof(*tasks), tasks);
 \section ParallelTasks Parallel Tasks
 
 StarPU can leverage existing parallel computation libraries by the means of
-parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
+parallel tasks. A parallel task is a task which is run by a set of CPUs
 (called a parallel or combined worker) at the same time, by using an existing
 parallel CPU implementation of the computation to be achieved. This can also be
 useful to improve the load balance between slow CPUs and fast GPUs: since CPUs

+ 7 - 2
doc/doxygen/chapters/310_data_management.doxy

@@ -22,11 +22,16 @@ TODO: intro which mentions consistency among other things
 
 \section DataInterface Data Interface
 
-StarPU provides several data interfaces for programmers to describe the data layout of their application. There are predefined interfaces already available in StarPU. Users can define new data interfaces as explained in \ref DefiningANewDataInterface. All functions provided by StarPU are documented in \ref API_Data_Interfaces. You will find a short list below.
+StarPU provides several data interfaces for programmers to describe
+the data layout of their application. There are predefined interfaces
+already available in StarPU. Users can define new data interfaces as
+explained in \ref DefiningANewDataInterface. All functions provided by
+StarPU are documented in \ref API_Data_Interfaces. You will find a
+short list below.
 
 \subsection VariableDataInterface Variable Data Interface
 
-A variable is a given size byte element, typically a scalar. Here an
+A variable is a given-size byte element, typically a scalar. Here is an
 example of how to register a variable data to StarPU by using
 starpu_variable_data_register().
 

+ 17 - 6
doc/doxygen/chapters/320_scheduling.doxy

@@ -100,14 +100,18 @@ become available, without taking priorities into account.
 The <b>dmda</b> (deque model data aware) scheduler is similar to dm, but it also takes
 into account data transfer time.
 
+The <b>dmdap</b> (deque model data aware prio) scheduler is similar to dmda,
+except that it sorts tasks by priority order, which allows to become even closer
+to HEFT by respecting priorities after having made the scheduling decision (but
+it still schedules tasks in the order they become available).
+
 The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
 but it also privileges tasks whose data buffers are already available
 on the target device.
 
-The <b>dmdas</b> (deque model data aware sorted) scheduler is similar to dmdar,
-except that it sorts tasks by priority order, which allows to become even closer
-to HEFT by respecting priorities after having made the scheduling decision (but
-it still schedules tasks in the order they become available).
+The <b>dmdas</b> (deque model data aware sorted) scheduler combines dmdap and
+dmdar: it sorts tasks by priority order,
+but for a given priority it will privilege tasks whose data buffers are already
+available on the target device.
 
 The <b>dmdasd</b> (deque model data aware sorted decision) scheduler is similar
 to dmdas, except that when scheduling a task, it takes into account its priority
@@ -204,12 +208,15 @@ pre-defined Modularized Schedulers :
 - Eager-based Schedulers (with/without prefetching : \c modular-eager ,
 \c modular-eager-prefetching) : \n
 Naive scheduler, which tries to map a task on the first available resource
-it finds.
+it finds. The prefetching variant queues several tasks in advance to be able to
+do data prefetching. This may however degrade load balancing a bit.
 
 - Prio-based Schedulers (with/without prefetching :
-\c modular-prio, \c modular-prio-prefetching) : \n
+\c modular-prio, \c modular-prio-prefetching , \c modular-eager-prio) : \n
 Similar to Eager-Based Schedulers. Can handle tasks which have a defined
 priority and schedule them accordingly.
+The \c modular-eager-prio variant integrates the eager and priority queue in a
+single component. This allows it to do a better job at pushing tasks.
 
 - Random-based Schedulers (with/without prefetching: \c modular-random,
 \c modular-random-prio, \c modular-random-prefetching, \c
@@ -231,6 +238,10 @@ modular-heft-prio is similar to \c modular-heft, but only decides the memory
 node, not the exact worker, just pushing tasks to one central queue per memory
 node.
 
+- Heteroprio Scheduler: \n
+Maps tasks to workers similarly to HEFT, but first attributes accelerated tasks to
+GPUs, then not-so-accelerated tasks to CPUs.
+
 To use one of these schedulers, one can set the environment variable \ref STARPU_SCHED.
 
 \section StaticScheduling Static Scheduling

File diff suppressed because it is too large
+ 454 - 0
doc/doxygen/chapters/370_online_performance_tools.doxy


+ 1 - 1
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -438,7 +438,7 @@ histogram of the codelet execution time distribution.
 
 More than just codelet performance, it is interesting to get statistics over all
 kinds of StarPU states (allocations, data transfers, etc.). This is particularly
-useful to check what may have gone wrong in the accurracy of the simgrid
+useful to check what may have gone wrong in the accuracy of the SimGrid
 simulation.
 
 This requires the <c>R</c> statistical tool, with the <c>plyr</c>,

+ 5 - 5
doc/doxygen/chapters/470_simgrid.doxy

@@ -23,14 +23,14 @@
 /*! \page SimGridSupport SimGrid Support
 
 StarPU can use SimGrid in order to simulate execution on an arbitrary
-platform. This was tested with simgrid from 3.11 to 3.16, and 3.18 to 3.22.
+platform. This was tested with SimGrid from 3.11 to 3.16, and 3.18 to 3.23.
 Other versions may have compatibility issues. 3.17 notably does not build at
 all. MPI simulation does not work with version 3.22.
 
 \section Preparing Preparing Your Application For Simulation
 
 There are a few technical details which need to be handled for an application to
-be simulated through Simgrid.
+be simulated through SimGrid.
 
 If the application uses <c>gettimeofday</c> to make its
 performance measurements, the real time will be used, which will be bogus. To
@@ -38,19 +38,19 @@ get the simulated time, it has to use starpu_timing_now() which returns the
 virtual timestamp in us.
 
 For some technical reason, the application's .c file which contains \c main() has
-to be recompiled with \c starpu_simgrid_wrap.h, which in the simgrid case will <c># define main()</c>
+to be recompiled with \c starpu_simgrid_wrap.h, which in the SimGrid case will <c># define main()</c>
 into <c>starpu_main()</c>, and it is \c libstarpu which will provide the real \c main() and
 will call the application's \c main().
 
 To be able to test with crazy data sizes, one may want to only allocate
 application data if the macro \c STARPU_SIMGRID is not defined.  Passing a <c>NULL</c> pointer to
 \c starpu_data_register functions is fine, data will never be read/written to by
-StarPU in Simgrid mode anyway.
+StarPU in SimGrid mode anyway.
 
 To be able to run the application with e.g. CUDA simulation on a system which
 does not have CUDA installed, one can fill the starpu_codelet::cuda_funcs with \c (void*)1, to
 express that there is a CUDA implementation, even if one does not actually
-provide it. StarPU will not actually run it in Simgrid mode anyway by default
+provide it. StarPU will not actually run it in SimGrid mode anyway by default
 (unless the ::STARPU_CODELET_SIMGRID_EXECUTE or ::STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
 flags are set in the codelet)
 

+ 18 - 9
doc/doxygen/chapters/501_environment_variables.doxy

@@ -512,6 +512,15 @@ the coefficient to be applied to it before adding it to the computation part.
 Define the execution time penalty of a joule (\ref Energy-basedScheduling).
 </dd>
 
+<dt>STARPU_SCHED_READY</dt>
+<dd>
+\anchor STARPU_SCHED_READY
+\addindex __env__STARPU_SCHED_READY
+For a modular scheduler with sorted queues below the decision component, workers
+pick up a task which has most of its data already available. Setting this to 0
+disables this.
+</dd>
+
 <dt>STARPU_IDLE_POWER</dt>
 <dd>
 \anchor STARPU_IDLE_POWER
@@ -647,7 +656,7 @@ STARPU_MPI_DRIVER_CALL_FREQUENCY environment variable set to a positive value.
 \anchor STARPU_SIMGRID_TRANSFER_COST
 \addindex __env__STARPU_SIMGRID_TRANSFER_COST
 When set to 1 (which is the default), data transfers (over PCI bus, typically) are taken into account
-in simgrid mode.
+in SimGrid mode.
 </dd>
 
 <dt>STARPU_SIMGRID_CUDA_MALLOC_COST</dt>
@@ -655,7 +664,7 @@ in simgrid mode.
 \anchor STARPU_SIMGRID_CUDA_MALLOC_COST
 \addindex __env__STARPU_SIMGRID_CUDA_MALLOC_COST
 When set to 1 (which is the default), CUDA malloc costs are taken into account
-in simgrid mode.
+in SimGrid mode.
 </dd>
 
 <dt>STARPU_SIMGRID_CUDA_QUEUE_COST</dt>
@@ -663,14 +672,14 @@ in simgrid mode.
 \anchor STARPU_SIMGRID_CUDA_QUEUE_COST
 \addindex __env__STARPU_SIMGRID_CUDA_QUEUE_COST
 When set to 1 (which is the default), CUDA task and transfer queueing costs are
-taken into account in simgrid mode.
+taken into account in SimGrid mode.
 </dd>
 
 <dt>STARPU_PCI_FLAT</dt>
 <dd>
 \anchor STARPU_PCI_FLAT
 \addindex __env__STARPU_PCI_FLAT
-When unset or set to 0, the platform file created for simgrid will
+When unset or set to 0, the platform file created for SimGrid will
 contain PCI bandwidths and routes.
 </dd>
 
@@ -678,7 +687,7 @@ contain PCI bandwidths and routes.
 <dd>
 \anchor STARPU_SIMGRID_QUEUE_MALLOC_COST
 \addindex __env__STARPU_SIMGRID_QUEUE_MALLOC_COST
-When unset or set to 1, simulate within simgrid the GPU transfer queueing.
+When unset or set to 1, simulate within SimGrid the GPU transfer queueing.
 </dd>
 
 <dt>STARPU_MALLOC_SIMULATION_FOLD</dt>
@@ -695,7 +704,7 @@ MiB. The default is 1, thus allowing 64GiB virtual memory when Linux's
 \anchor STARPU_SIMGRID_TASK_SUBMIT_COST
 \addindex __env__STARPU_SIMGRID_TASK_SUBMIT_COST
 When set to 1 (which is the default), task submission costs are taken into
-account in simgrid mode. This provides more accurate simgrid predictions,
+account in SimGrid mode. This provides more accurate SimGrid predictions,
 especially for the beginning of the execution.
 </dd>
 
@@ -704,7 +713,7 @@ especially for the beginning of the execution.
 \anchor STARPU_SIMGRID_FETCHING_INPUT_COST
 \addindex __env__STARPU_SIMGRID_FETCHING_INPUT_COST
 When set to 1 (which is the default), fetching input costs are taken into
-account in simgrid mode. This provides more accurate simgrid predictions,
+account in SimGrid mode. This provides more accurate SimGrid predictions,
 especially regarding data transfers.
 </dd>
 
@@ -713,7 +722,7 @@ especially regarding data transfers.
 \anchor STARPU_SIMGRID_SCHED_COST
 \addindex __env__STARPU_SIMGRID_SCHED_COST
 When set to 1 (0 is the default), scheduling costs are taken into
-account in simgrid mode. This provides more accurate simgrid predictions,
+account in SimGrid mode. This provides more accurate SimGrid predictions,
 and allows studying scheduling overhead of the runtime system. However,
 it also makes simulation non-deterministic.
 </dd>
@@ -1174,7 +1183,7 @@ average.
 \addindex __env__STARPU_RAND_SEED
 The random scheduler and some examples use random numbers for their own
 working. Depending on the examples, the seed is by default just always 0 or
-the current time() (unless simgrid mode is enabled, in which case it is always
+the current time() (unless SimGrid mode is enabled, in which case it is always
 0). \ref STARPU_RAND_SEED allows to set the seed to a specific value.
 </dd>
 

+ 5 - 5
doc/doxygen/chapters/510_configure_options.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015-2017                      Inria
- * Copyright (C) 2010-2017, 2019                                CNRS
+ * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2009-2011,2013-2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -281,7 +281,7 @@ contain the OpenCL shared libraries---e.g. <c>libOpenCL.so</c>. This defaults to
 \addindex __configure__--enable-opencl-simulator
 Enable considering the provided OpenCL implementation as a simulator, i.e. use
 the kernel duration returned by OpenCL profiling information as wallclock time
+instead of the actual measured real time. This requires SimGrid support.
+instead of the actual measured real time. This requires the SimGrid support.
 </dd>
 
 <dt>--enable-maximplementations=<c>count</c></dt>
@@ -679,10 +679,10 @@ Enable memory statistics (\ref MemoryFeedback).
 <dd>
 \anchor enable-simgrid
 \addindex __configure__--enable-simgrid
-Enable simulation of execution in simgrid, to allow easy experimentation with
+Enable simulation of execution in SimGrid, to allow easy experimentation with
 various numbers of cores and GPUs, or amount of memory, etc. Experimental.
 
-The path to simgrid can be specified through the <c>SIMGRID_CFLAGS</c> and
+The path to SimGrid can be specified through the <c>SIMGRID_CFLAGS</c> and
 <c>SIMGRID_LIBS</c> environment variables, for instance:
 
 \verbatim
@@ -727,7 +727,7 @@ Use the smpirun at <c>path</c>
 <dd>
 \anchor enable-simgrid-mc
 \addindex __configure__--enable-simgrid-mc
-Enable the Model Checker in simulation of execution in simgrid, to allow
+Enable the Model Checker in simulation of execution in SimGrid, to allow
 exploring various execution paths.
 </dd>
 

+ 7 - 0
doc/doxygen/chapters/520_files.doxy

@@ -25,6 +25,7 @@
 \file starpu_bound.h
 \file starpu_clusters.h
 \file starpu_cublas.h
+\file starpu_cublas_v2.h
 \file starpu_cusparse.h
 \file starpu_cuda.h
 \file starpu_data_filters.h
@@ -36,10 +37,15 @@
 \file starpu_expert.h
 \file starpu_fxt.h
 \file starpu_hash.h
+\file starpu_helper.h
+\file starpu_heteroprio.h
 \file starpu_mic.h
+\file starpu_mpi_ms.h
 \file starpu_mod.f90
 \file starpu_opencl.h
 \file starpu_openmp.h
+\file starpu_perf_monitoring.h
+\file starpu_perf_steering.h
 \file starpu_perfmodel.h
 \file starpu_profiling.h
 \file starpu_rand.h
@@ -52,6 +58,7 @@
 \file starpu_stdlib.h
 \file starpu_task_bundle.h
 \file starpu_task.h
+\file starpu_task_dep.h
 \file starpu_task_list.h
 \file starpu_task_util.h
 \file starpu_thread.h

+ 4 - 4
doc/doxygen/chapters/api/threads.doxy

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2009-2011,2014,2016                      Université de Bordeaux
  * Copyright (C) 2011,2012                                Inria
  *
@@ -20,7 +20,7 @@
 
 \brief This section describes the thread facilities provided
 by StarPU. The thread functions are either implemented on top of the
-pthread library or the Simgrid library when the simulated performance
+pthread library or the SimGrid library when the simulated performance
 mode is enabled (\ref SimGridSupport).
 
 \def STARPU_PTHREAD_CREATE_ON
@@ -359,8 +359,8 @@ todo
 \fn void starpu_sleep(float nb_sec)
 \ingroup API_Threads
 Similar to calling Unix' \c sleep function, except that it takes a float
-to allow sub-second sleeping, and when StarPU is compiled in simgrid mode it
-does not really sleep but just makes simgrid record that the thread has taken
+to allow sub-second sleeping, and when StarPU is compiled in SimGrid mode it
+does not really sleep but just makes SimGrid record that the thread has taken
 some time to sleep.
 
 */

+ 48 - 14
doc/doxygen/dev/checkDoc.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2013,2014,2016,2017                      CNRS
+# Copyright (C) 2013,2014,2016,2017,2019                      CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,21 +16,55 @@
 #
 dirname=$(dirname $0)
 
-x=$(grep ingroup $dirname/../chapters/api/*.doxy $dirname/../chapters/api/sc_hypervisor/*.doxy |awk -F':' '{print $2}'| awk 'NF != 2')
-if test -n "$x" ; then
-    echo Errors on group definitions
-    echo $x
-fi
-
-echo
+DIRS="$dirname/../../../include $dirname/../../../mpi/include $dirname/../../../starpurm/include $dirname/../../../sc_hypervisor/include"
 echo "Defined groups"
-grep ingroup $dirname/../chapters/api/*.doxy $dirname/../chapters/api/sc_hypervisor/*.doxy|awk -F':' '{print $2}'| awk 'NF == 2'|sort|uniq
+groups=""
+for d in $DIRS
+do
+    echo Checking $d
+    gg=$(grep -rs defgroup $d | awk '{print $3}')
+    echo $gg
+    groups=$(echo $groups $gg)
+done
+for g in $groups
+do
+    gg=$(echo $g | sed 's/_/__/g')
+    x=$(grep $gg $dirname/../refman.tex)
+    if test -z "$x"
+    then
+	echo "Error. Group $g not included in refman.tex"
+    fi
+done
 echo
 
-for f in $dirname/../../../build/doc/doxygen/latex/*tex ; do
-    x=$(grep $(basename $f .tex) $dirname/../refman.tex)
-    if test -z "$x" ; then
-	echo Error. $f not included in refman.tex
-    fi
+for d in $DIRS
+do
+    for f in $(find $d -name "*.h")
+    do
+	ff=$(echo $f  | awk -F'/' '{print $NF}')
+	x=$(grep $ff $dirname/../doxygen-config.cfg.in)
+	if test -z "$x"
+	then
+	    echo Error. $f not included in doxygen-config.cfg.in
+	fi
+	x=$(grep $ff $dirname/../chapters/520_files.doxy)
+	if test -z "$x"
+	then
+	    echo Error. $f not included in 520_files.doxy
+	fi
+    done
+done
+echo
+
+for p in starpu sc__hypervisor
+do
+    for f in $dirname/../../../build/doc/doxygen/latex/${p}*tex
+    do
+	x=$(grep $(basename $f .tex) $dirname/../refman.tex)
+	if test -z "$x"
+	then
+	    echo Error. $f not included in refman.tex
+	fi
+    done
 done
 

+ 0 - 28
doc/doxygen/dev/sc_funcs.cocci

@@ -1,28 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2014,2015,2017                           CNRS
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-@scfunc@
-position p;
-type t;
-identifier f =~ "sc";
-@@
-
-t f@p( ... );
-
-@ script:python @
-p << scfunc.p;
-f << scfunc.f;
-@@
-print "%s,%s:%s" % (f,p[0].file,p[0].line)

+ 23 - 28
doc/doxygen/dev/starpu_check_refs.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2016-2018                                CNRS
+# Copyright (C) 2016-2019                                CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -20,10 +20,6 @@ greencolor=$(tput setaf 2)
 
 dirname=$(dirname $0)
 
-STARPU_H_FILES=$(find $dirname/../../../include $dirname/../../../mpi/include -name '*.h')
-SC_H_FILES=$(find $dirname/../../../sc_hypervisor/include -name '*.h')
-SRC="$dirname/../../../src $dirname/../../../mpi/src $dirname/../../../sc_hypervisor/src"
-
 #grep --exclude-dir=.git --binary-files=without-match -rsF "\ref" $dirname/../chapters|grep -v "\\ref [a-zA-Z]"
 #echo continue && read
 
@@ -32,36 +28,35 @@ GREP="grep --exclude-dir=.git --binary-files=without-match -rsF"
 REFS=$($GREP "\ref" $dirname/../chapters| tr ':' '\012' | tr '.' '\012'  | tr ',' '\012'  | tr '(' '\012' | tr ')' '\012' | tr ' ' '\012'|grep -F '\ref' -A1 | grep -v '^--$' | sed 's/\\ref/=\\ref/' | tr '\012' ':' | tr '=' '\012' | sort | uniq)
 find $dirname/../chapters -name "*doxy" -exec cat {} \; > /tmp/DOXYGEN_$$
 cat $dirname/../refman.tex >> /tmp/DOXYGEN_$$
+find $dirname/../../../include -name "*h" -exec cat {} \; >> /tmp/DOXYGEN_$$
+find $dirname/../../../starpurm/include -name "*h" -exec cat {} \; >> /tmp/DOXYGEN_$$
+find $dirname/../../../mpi/include -name "*h" -exec cat {} \; >> /tmp/DOXYGEN_$$
+find $dirname/../../../sc_hypervisor/include -name "*h" -exec cat {} \; >> /tmp/DOXYGEN_$$
+
+stcolor=$(tput sgr0)
+redcolor=$(tput setaf 1)
+greencolor=$(tput setaf 2)
 
 for r in $REFS
 do
     ref=$(echo $r | sed 's/\\ref:\(.*\):/\1/')
-    n=$($GREP -crs "section $ref" /tmp/DOXYGEN_$$)
-    if test $n -eq 0
+    if test -n "$ref"
     then
-	n=$($GREP -crs "anchor $ref" /tmp/DOXYGEN_$$)
-	if test $n -eq 0
-	then
-	    n=$($GREP -crs "ingroup $ref" /tmp/DOXYGEN_$$)
-	    if test $n -eq 0
+	#echo "ref $ref"
+	for keyword in "section " "anchor " "ingroup " "defgroup " "def " "struct " "label{"
+	do
+	    n=$($GREP -crs "${keyword}${ref}" /tmp/DOXYGEN_$$)
+	    if test $n -ne 0
 	    then
-		n=$($GREP -crs "def $ref" /tmp/DOXYGEN_$$)
-		if test $n -eq 0
-		then
-		    n=$($GREP -crs "struct $ref" /tmp/DOXYGEN_$$)
-		    if test $n -eq 0
-		    then
-			if test $n -eq 0
-			then
-			    n=$($GREP -crs "label{$ref" /tmp/DOXYGEN_$$)
-			    if test $n -eq 0
-			    then
-				echo $ref missing
-			    fi
-			fi
-		    fi
-		fi
+		break
 	    fi
+	done
+	if test $n -eq 0
+	then
+	    echo "${redcolor}$ref${stcolor} is missing"
+	else
+	    true
+	    #echo "${greencolor}$ref${stcolor} is ok"
 	fi
     fi
 done

+ 63 - 35
doc/doxygen/dev/starpu_check_undocumented.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2011-2018                                CNRS
+# Copyright (C) 2011-2019                                CNRS
 # Copyright (C) 2011                                     Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,8 +15,6 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
-# Note: expects Coccinelle's spatch command n the PATH
-# See: http://coccinelle.lip6.fr/
 
 stcolor=$(tput sgr0)
 redcolor=$(tput setaf 1)
@@ -40,52 +38,81 @@ else
     fi
 fi
 
-if [ "$1" == "--func" ] || [ "$1" == "" ] ; then
-    starpu_functions=$(spatch -very_quiet -sp_file $dirname/starpu_funcs.cocci $STARPU_H_FILES)
-    sc_functions=$(spatch -very_quiet -sp_file $dirname/sc_funcs.cocci $SC_H_FILES)
-    for func in $starpu_functions $sc_functions ; do
-	fname=$(echo $func|awk -F ',' '{print $1}')
-	location=$(echo $func|awk -F ',' '{print $2}')
-	x=$(grep "$fname(" $dirname/../chapters/api/*.doxy | grep "\\fn")
-	if test "$x" == "" ; then
-	    echo "function ${redcolor}${fname}${stcolor} at location ${redcolor}$location${stcolor} is not (or incorrectly) documented"
-	    #	else
-	    #		echo "function ${greencolor}${fname}${stcolor} at location $location is correctly documented"
-	fi
+ok()
+{
+    type=$1
+    name=$2
+    echo "$type ${greencolor}${name}${stcolor} is (maybe correctly) documented"
+}
+
+ko()
+{
+    type=$1
+    name=$2
+    echo "$type ${redcolor}${name}${stcolor} is not (or incorrectly) documented"
+}
+
+if [ "$1" == "--func" ] || [ "$1" == "" ]
+then
+    for f in $STARPU_H_FILES $SC_H_FILES
+    do
+	grep "(" $f | grep ';' | grep starpu | grep '^[a-z]' | grep -v typedef | grep -v '(\*' | while read line
+	do
+	    x=$(grep -F -B1 "$line" $f | head -1)
+	    fname=$(echo $line | awk -F'(' '{print $1}' | awk '{print $NF}' | tr -d '*')
+	    if test "$x" == '*/'
+	    then
+		ok function $fname
+	    else
+		#echo $line
+		ko function $fname
+	    fi
+	done
     done
-    echo
 fi
 
 if [ "$1" == "--struct" ] || [ "$1" == "" ] ; then
-    starpu_structs=$(grep "struct starpu" $STARPU_H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
-    sc_structs=$(grep "struct sc" $SC_H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
-    for struct in $starpu_structs $sc_structs ; do
-	x=$(grep -F "\\struct $struct" $dirname/../chapters/api/*.doxy)
-	if test "$x" == "" ; then
-	    echo "struct ${redcolor}${struct}${stcolor} is not (or incorrectly) documented"
+    starpu=$(grep "^struct starpu_[a-z_]*$" $STARPU_H_FILES | awk '{print $NF}')
+    sc=$(grep "^struct sc_[a-z_]*$" $SC_H_FILES | awk '{print $NF}')
+    for o in $starpu $sc ; do
+	hfile=$(grep -l "^struct ${o}$" $STARPU_H_FILES $SC_H_FILES)
+	x=$(grep -B1 "^struct ${o}$" $hfile | head -1)
+	if test "$x" == '*/'
+	then
+	    ok "struct" ${o}
+	else
+	    ko "struct" ${o}
 	fi
     done
     echo
 fi
 
 if [ "$1" == "--enum" ] || [ "$1" == "" ] ; then
-    starpu_enums=$(grep "enum starpu" $STARPU_H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
-    sc_enums=$(grep "enum starpu" $SC_H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
-    for enum in $starpu_enums $sc_enums ; do
-	x=$(grep -F "\\enum $enum" $dirname/../chapters/api/*.doxy)
-	if test "$x" == "" ; then
-	    echo "enum ${redcolor}${enum}${stcolor} is not (or incorrectly) documented"
+    starpu=$(grep "^enum starpu_[a-z_]*$" $STARPU_H_FILES | awk '{print $NF}')
+    sc=$(grep "^enum sc_[a-z_]*$" $SC_H_FILES | awk '{print $NF}')
+    for o in $starpu $sc ; do
+	hfile=$(grep -l "^enum ${o}$" $STARPU_H_FILES $SC_H_FILES)
+	x=$(grep -B1 "^enum ${o}$" $hfile | head -1)
+	if test "$x" == '*/'
+	then
+	    ok "enum" ${o}
+	else
+	    ko "enum" ${o}
 	fi
     done
     echo
 fi
 
 if [ "$1" == "--macro" ] || [ "$1" == "" ] ; then
-    macros=$(grep "define\b" $STARPU_H_FILES $SC_H_FILES |grep -v deprecated|grep "#" | grep -v "__" | sed 's/#[ ]*/#/g' | awk '{print $2}' | awk -F'(' '{print $1}' | sort|uniq)
-    for macro in $macros ; do
-	x=$(grep -F "\\def $macro" $dirname/../chapters/api/*.doxy)
-	if test "$x" == "" ; then
-	    echo "macro ${redcolor}${macro}${stcolor} is not (or incorrectly) documented"
+    macros=$(grep "define\b" $STARPU_H_FILES $SC_H_FILES |grep -v deprecated|grep "#" | grep -v "__" | sed 's/#[ ]*/#/g' | awk '{print $2}' | awk -F'(' '{print $1}' | grep -i starpu | sort|uniq)
+    for o in $macros ; do
+	# Look at the line preceding the first definition of the macro: it is
+	# considered documented when that line closes a comment ('*/').
+	# -h drops the "file:" prefix grep prints when several headers are
+	# searched (with the prefix the comparison below could never succeed),
+	# the trailing \b keeps STARPU_FOO from matching STARPU_FOO_BAR, and
+	# [[:space:]] accepts tabs or several blanks after "define".
+	x=$(grep -h -B1 "define\b[[:space:]][[:space:]]*${o}\b" $STARPU_H_FILES $SC_H_FILES | head -1)
+	if test "$x" == '*/'
+	then
+	    ok "define" ${o}
+	else
+	    ko "define" ${o}
 	fi
     done
     echo
@@ -96,8 +123,9 @@ if [ "$1" == "--var" ] || [ "$1" == "" ] ; then
     for variable in $variables ; do
 	x=$(grep "$variable" $dirname/../chapters/501_environment_variables.doxy | grep "\\anchor")
 	if test "$x" == "" ; then
-	    echo "variable ${redcolor}${variable}${stcolor} is not (or incorrectly) documented"
+	    ko "variable" $variable
+	else
+	    ok "variable" $variable
 	fi
     done
 fi
-

+ 0 - 28
doc/doxygen/dev/starpu_funcs.cocci

@@ -1,28 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2013,2015,2017                           CNRS
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-@starpufunc@
-position p;
-type t;
-identifier f =~ "starpu";
-@@
-
-t f@p( ... );
-
-@ script:python @
-p << starpufunc.p;
-f << starpufunc.f;
-@@
-print "%s,%s:%s" % (f,p[0].file,p[0].line)

+ 11 - 1
doc/doxygen/doxygen-config.cfg.in

@@ -26,6 +26,7 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 	 		 @top_srcdir@/include/starpu_clusters.h \
 			 @top_srcdir@/include/starpu_cusparse.h \
 			 @top_srcdir@/include/starpu_cublas.h \
+			 @top_srcdir@/include/starpu_cublas_v2.h \
 			 @top_srcdir@/include/starpu_cuda.h \
 			 @top_srcdir@/include/starpu_data_filters.h \
 			 @top_srcdir@/include/starpu_data.h \
@@ -39,9 +40,12 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/include/starpu_hash.h \
 			 @top_srcdir@/include/starpu_helper.h \
 			 @top_srcdir@/include/starpu_mic.h \
+			 @top_srcdir@/include/starpu_mpi_ms.h \
 			 @top_srcdir@/include/starpu_mod.f90 \
 			 @top_srcdir@/include/starpu_opencl.h \
 			 @top_srcdir@/include/starpu_openmp.h \
+			 @top_srcdir@/include/starpu_perf_monitoring.h \
+			 @top_srcdir@/include/starpu_perf_steering.h \
 			 @top_srcdir@/include/starpu_perfmodel.h \
 			 @top_srcdir@/include/starpu_profiling.h \
 			 @top_srcdir@/include/starpu_rand.h \
@@ -69,7 +73,13 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/mpi/include/fstarpu_mpi_mod.f90 \
 			 @top_srcdir@/starpufft/include/starpufft.h \
 			 @top_srcdir@/sc_hypervisor/include \
-			 @top_srcdir@/starpurm/include/starpurm.h
+			 @top_srcdir@/sc_hypervisor/include/sc_hypervisor_config.h \
+			 @top_srcdir@/sc_hypervisor/include/sc_hypervisor_policy.h \
+			 @top_srcdir@/sc_hypervisor/include/sc_hypervisor_lp.h  \
+			 @top_srcdir@/sc_hypervisor/include/sc_hypervisor.h \
+			 @top_srcdir@/sc_hypervisor/include/sc_hypervisor_monitoring.h \
+			 @top_srcdir@/starpurm/include/starpurm.h \
+			 @top_srcdir@/include/schedulers/starpu_heteroprio.h
 
 EXAMPLE_PATH           = @top_srcdir@/doc/doxygen \
 		       	 @top_srcdir@/doc/doxygen/chapters \

+ 1 - 0
doc/doxygen/doxygen.cfg

@@ -1627,6 +1627,7 @@ PREDEFINED             = STARPU_USE_OPENCL=1 \
 			 STARPU_MKL=1 \
 			 STARPU_WORKER_CALLBACKS=1 \
 			 STARPU_HAVE_GLPK_H=1 \
+			 STARPU_USE_MPI_MASTER_SLAVE=1 \
                          __GCC__
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then

+ 14 - 2
doc/doxygen/refman.tex

@@ -37,7 +37,7 @@ Generated by Doxygen.
 This manual documents the usage of StarPU version \STARPUVERSION. Its contents
 were last updated on \STARPUUPDATED.\\
 
-Copyright © 2009–2018 Université de Bordeaux\\
+Copyright © 2009–2018 Université de Bordeaux
 
 Copyright © 2010-2018 CNRS
 
@@ -235,6 +235,8 @@ Documentation License”.
 \input{group__API__Codelet__And__Tasks}
 \input{group__API__Insert__Task}
 \input{group__API__Explicit__Dependencies}
+\input{group__API__Perf__Monitoring}
+\input{group__API__Perf__Steering}
 \input{group__API__Performance__Model}
 \input{group__API__Profiling}
 \input{group__API__Theoretical__Lower__Bound__on__Execution__Time}
@@ -260,6 +262,9 @@ Documentation License”.
 \input{group__API__Modularized__Scheduler}
 \input{group__API__Clustering__Machine}
 \input{group__API__Interop__Support}
+\input{group__API__Master__Slave}
+\input{group__API__Random__Functions}
+\input{group__API__Sink}
 
 \chapter{File Index}
 \input{files}
@@ -274,6 +279,7 @@ Documentation License”.
 \input{starpu__clusters_8h}
 \input{starpu__config_8h}
 \input{starpu__cublas_8h}
+\input{starpu__cublas__v2_8h}
 \input{starpu__cusparse_8h}
 \input{starpu__cuda_8h}
 \input{starpu__data_8h}
@@ -285,12 +291,17 @@ Documentation License”.
 \input{starpu__expert_8h}
 \input{starpu__fxt_8h}
 \input{starpu__hash_8h}
+\input{starpu__helper_8h}
+\input{starpu__heteroprio_8h}
 \input{starpu__mic_8h}
 \input{starpu__mod_8f90}
 \input{starpu__mpi_8h}
 \input{starpu__mpi__lb_8h}
+\input{starpu__mpi__ms_8h}
 \input{starpu__opencl_8h}
 \input{starpu__openmp_8h}
+\input{starpu__perf__monitoring_8h}
+\input{starpu__perf__steering_8h}
 \input{starpu__perfmodel_8h}
 \input{starpu__profiling_8h}
 \input{starpu__rand_8h}
@@ -303,6 +314,7 @@ Documentation License”.
 \input{starpu__stdlib_8h}
 \input{starpu__task_8h}
 \input{starpu__task__bundle_8h}
+\input{starpu__task__dep_8h}
 \input{starpu__task__list_8h}
 \input{starpu__task__util_8h}
 \input{starpu__thread_8h}
@@ -335,7 +347,7 @@ Documentation License”.
 \hypertarget{GNUFreeDocumentationLicense}{}
 \input{GNUFreeDocumentationLicense}
 
-\part{Index}
+%\part{Index}
 \addcontentsline{toc}{chapter}{Index}
 \printindex
 

+ 5 - 0
examples/Makefile.am

@@ -244,6 +244,11 @@ STARPU_EXAMPLES +=				\
 	interface/complex			\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
+	perf_monitoring/perf_counters_01	\
+	perf_monitoring/perf_counters_02	\
+	perf_steering/perf_knobs_01		\
+	perf_steering/perf_knobs_02		\
+	perf_steering/perf_knobs_03		\
 	scheduler/heteroprio_test		\
 	sched_ctx/sched_ctx			\
 	sched_ctx/sched_ctx_empty		\

+ 3 - 3
examples/callback/prologue.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013,2014                                Inria
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017,2019                           CNRS
  * Copyright (C) 2009,2010,2013-2015                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -92,9 +92,9 @@ int main(void)
 	ret = starpu_task_insert(&cl,
 				 STARPU_RW, handle,
 				 STARPU_PROLOGUE_CALLBACK, prologue_callback_func,
-				 STARPU_PROLOGUE_CALLBACK_ARG, &x,
+				 STARPU_PROLOGUE_CALLBACK_ARG_NFREE, &x,
 				 STARPU_PROLOGUE_CALLBACK_POP, pop_prologue_callback_func,
-				 STARPU_PROLOGUE_CALLBACK_POP_ARG, 5,
+				 STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE, 5,
 				 0);
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");

+ 4 - 3
examples/cholesky/cholesky.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2018                                     Université de Bordeaux
+# Copyright (C) 2018-2019                                Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,8 +16,9 @@
 #
 
 ROOT=${0%.sh}
+[ -z "$STARPU_SCHED" ] || STARPU_SCHEDS="$STARPU_SCHED"
 #[ -n "$STARPU_SCHEDS" ] || STARPU_SCHEDS=`$(dirname $0)/../../tools/starpu_sched_display`
-[ -n "$STARPU_SCHEDS" ] || STARPU_SCHEDS="dmdas modular-heft modular-heft-prio dmdar dmda dmdasd prio lws"
+[ -n "$STARPU_SCHEDS" ] || STARPU_SCHEDS="dmdas modular-heft2 modular-heft modular-heft-prio modular-heteroprio dmdap dmdar dmda dmdasd prio lws"
 [ -n "$STARPU_HOSTNAME" ] || export STARPU_HOSTNAME=mirage
 unset MALLOC_PERTURB_
 
@@ -61,5 +62,5 @@ do
 	COMMA=", "
 done
 gnuplot cholesky.gp
-gv $OUTFILE
+#gv $OUTFILE
 true

+ 35 - 0
examples/cpp/Makefile_add_vectors.mk

@@ -0,0 +1,35 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2016                                     Inria
+# Copyright (C) 2017                                     CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+PROG = add_vectors
+
+SRCCXX = add_vectors.cpp
+
+CXX = g++
+
+CXXFLAGS = -g -DPRINT_OUTPUT $(shell pkg-config --cflags starpu-1.3)
+LDLIBS =  $(shell pkg-config --libs starpu-1.3)
+
+OBJS = $(SRCCXX:%.cpp=%.o)
+
+# Special target names are case-sensitive: only ".PHONY" marks all/clean as
+# targets that are not files.
+.PHONY: all clean
+all: $(PROG)
+
+# Link the example from its object files.
+$(PROG): $(OBJS)
+	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+
+# Remove the objects and the binary.
+clean:
+	rm -fv *.o $(PROG)

+ 35 - 0
examples/cpp/Makefile_add_vectors_cpp11.mk

@@ -0,0 +1,35 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2016                                     Inria
+# Copyright (C) 2017                                     CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+PROG = add_vectors_cpp11
+
+SRCCXX = add_vectors_cpp11.cpp
+
+CXX = g++
+
+CXXFLAGS = -g -std=c++11 -DPRINT_OUTPUT $(shell pkg-config --cflags starpu-1.3)
+LDLIBS =  $(shell pkg-config --libs starpu-1.3)
+
+OBJS = $(SRCCXX:%.cpp=%.o)
+
+# Special target names are case-sensitive: only ".PHONY" marks all/clean as
+# targets that are not files.
+.PHONY: all clean
+all: $(PROG)
+
+# Link the example from its object files.
+$(PROG): $(OBJS)
+	$(CXX) $(LDFLAGS) -o $@ $^ $(LDLIBS)
+
+# Remove the objects and the binary.
+clean:
+	rm -fv *.o $(PROG)

+ 672 - 0
examples/cpp/add_vectors_interface.cpp

@@ -0,0 +1,672 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012,2017                                Inria
+ * Copyright (C) 2010-2014,2016,2017,2019                 CNRS
+ * Copyright (C) 2009-2011,2013-2015,2017,2018-2019       Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This is a small example of a C++ program using the STL and StarPU. Here we
+ * just add two std::vector objects, duplicating the vectors. StarPU performs
+ * the data transfers between the objects.
+ */
+
+/*
+ * With a GCC older than 4.9 the example is replaced by a stub that only
+ * returns 77 (presumably the test-suite "skipped" status).
+ * The minor version macro is __GNUC_MINOR__; an unknown identifier would be
+ * evaluated as 0 by the preprocessor and disable the example for all GCC 4.x.
+ */
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 9))
+int main(int argc, char **argv)
+{
+	return 77;
+}
+#else
+#include <cassert>
+#include <new>
+#include <vector>
+
+#ifdef PRINT_OUTPUT
+#include <iostream>
+#endif
+
+#include <starpu.h>
+
+#define MY_TYPE char, my_allocator<char>
+
+/* create an allocator to put data on the correct NUMA node */
+/*
+ * STL-style allocator whose storage is obtained from StarPU on a given memory
+ * node (starpu_malloc_on_node / starpu_free_on_node), so that the buffer of a
+ * std::vector can be placed on the chosen node.
+ */
+template <class T>
+class my_allocator
+{
+	public:
+
+	typedef size_t    size_type;
+	typedef ptrdiff_t difference_type;
+	typedef T*        pointer;
+	typedef const T*  const_pointer;
+	typedef T&        reference;
+	typedef const T&  const_reference;
+	typedef T         value_type;
+
+	/* By default, allocate on the main RAM node */
+	my_allocator() : node(STARPU_MAIN_RAM)
+	{
+	}
+
+	my_allocator(const my_allocator& a) : node(a.get_node())
+	{
+	}
+
+	/* Allocate on the given StarPU memory node */
+	explicit my_allocator(const unsigned node) : node(node)
+	{
+	}
+
+	/* Allocate room for n objects of type T on the node of this allocator.
+	 * Throws std::bad_alloc when StarPU cannot provide the memory: a
+	 * container would otherwise construct its elements through a null pointer. */
+	pointer allocate(size_type n, const void * = 0)
+	{
+		T* t = (T*) starpu_malloc_on_node(this->node, n * sizeof(T));
+		if (!t && n != 0)
+			throw std::bad_alloc();
+		return t;
+	}
+
+	/* Give back a block of n objects previously obtained from allocate() */
+	void      deallocate(void* p, size_type n)
+	{
+		if (p)
+		{
+			starpu_free_on_node(this->node, (uintptr_t) p, n * sizeof(T));
+		}
+	}
+
+	/* StarPU memory node used by this allocator */
+	unsigned get_node() const
+	{
+		return node;
+	}
+
+	pointer address(reference x) const
+	{
+		return &x;
+	}
+
+	const_pointer address(const_reference x) const
+	{
+		return &x;
+	}
+
+	my_allocator<T>&  operator=(const my_allocator&ref)
+	{
+		node = ref.node;
+		return *this;
+	}
+
+	/* Copy-construct a T in the (already allocated) storage pointed to by p */
+	void construct(pointer p, const T& val)
+	{
+		new ((T*) p) T(val);
+	}
+
+	/* Destroy the object pointed to by p without releasing its storage */
+	void destroy(pointer p)
+	{
+		p->~T();
+	}
+
+	size_type max_size() const
+	{
+		return size_type(-1);
+	}
+
+
+	template <class U>
+		struct rebind
+		{
+			typedef my_allocator<U> other;
+		};
+
+	/* Conversions from an allocator of another value type keep the same
+	 * memory node. my_allocator<U> is a distinct class, so its private
+	 * member cannot be read directly: go through get_node(). */
+	template <class U>
+		explicit my_allocator(const my_allocator<U>&ref) : node(ref.get_node())
+		{
+		}
+
+	template <class U>
+		my_allocator<T>& operator=(const my_allocator<U>&ref)
+		{
+			node = ref.get_node();
+			return *this;
+		}
+
+	private:
+	unsigned node;
+};
+
+/*
+ * Create a new interface to catch C++ vector and make appropriate data transfers
+ */
+struct vector_cpp_interface
+{
+	/* interface identifier; set to STARPU_UNKNOWN_INTERFACE_ID at registration */
+	enum starpu_data_interface_id id;
+
+	/* address of the first element of the vector (0 when not set on a node) */
+	uintptr_t ptr;
+	/* handle and offset given to starpu_interface_copy() for transfers;
+	 * in this file dev_handle is always set to the same address as ptr
+	 * and offset to 0 */
+	uintptr_t dev_handle;
+	size_t offset;
+	/* number of elements, and size of one element in bytes */
+	uint32_t nx;
+	size_t elemsize;
+	/* the C++ vector object itself (NULL when not set on a node) */
+	std::vector<MY_TYPE>* vec;
+
+	/* initialised to 0 and propagated at registration; not read elsewhere
+	 * in this example */
+	uint32_t slice_base;
+};
+
+/* Give access to the std::vector stored in an interface passed as a void pointer */
+#define VECTOR_CPP_GET_VEC(interface)	({ (((struct vector_cpp_interface *)(interface))->vec); })
+
+static int vector_interface_copy_any_to_any(void *src_interface, unsigned src_node,
+                           void *dst_interface, unsigned dst_node, void *async_data);
+
+#if __cplusplus >= 201103L
+static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
+{
+
+	.can_copy = NULL,
+
+	.ram_to_ram = NULL,
+	.ram_to_cuda = NULL,
+	.ram_to_opencl = NULL,
+	.ram_to_mic = NULL,
+
+	.cuda_to_ram = NULL,
+	.cuda_to_cuda = NULL,
+	.cuda_to_opencl = NULL,
+
+	.opencl_to_ram = NULL,
+	.opencl_to_cuda = NULL,
+	.opencl_to_opencl = NULL,
+
+	.mic_to_ram = NULL,
+
+	.ram_to_mpi_ms = NULL,
+	.mpi_ms_to_ram = NULL,
+	.mpi_ms_to_mpi_ms = NULL,
+
+	.ram_to_cuda_async = NULL,
+	.cuda_to_ram_async = NULL,
+	.cuda_to_cuda_async = NULL,
+
+	.ram_to_opencl_async = NULL,
+	.opencl_to_ram_async = NULL,
+	.opencl_to_opencl_async = NULL,
+
+	.ram_to_mpi_ms_async = NULL,
+	.mpi_ms_to_ram_async = NULL,
+	.mpi_ms_to_mpi_ms_async = NULL,
+
+	.ram_to_mic_async = NULL,
+	.mic_to_ram_async = NULL,
+
+	.any_to_any = vector_interface_copy_any_to_any,
+};
+#else
+/*
+ * Positional variant of the copy methods, used when designated initializers
+ * are not available: only the last entry (any_to_any) is set.
+ * NOTE(review): this list has 30 entries whereas the designated variant above
+ * names 27 fields -- confirm against the definition of
+ * struct starpu_data_copy_methods that any_to_any lands in the right slot.
+ */
+static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
+{
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+
+	vector_interface_copy_any_to_any,
+};
+#endif
+
+/* Forward declarations of the callbacks referenced by the operations table */
+static void register_vector_cpp_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
+static starpu_ssize_t allocate_vector_cpp_buffer_on_node(void *data_interface_, unsigned dst_node);
+static void *vector_cpp_to_pointer(void *data_interface, unsigned node);
+static int vector_cpp_pointer_is_inside(void *data_interface, unsigned node, void *ptr);
+static void free_vector_cpp_buffer_on_node(void *data_interface, unsigned node);
+static size_t vector_cpp_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_vector_cpp_interface_crc32(starpu_data_handle_t handle);
+static int vector_cpp_compare(void *data_interface_a, void *data_interface_b);
+static void display_vector_cpp_interface(starpu_data_handle_t handle, FILE *f);
+static int pack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
+static int unpack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+static starpu_ssize_t vector_cpp_describe(void *data_interface, char *buf, size_t size);
+
+#if __cplusplus >= 201103L
+static struct starpu_data_interface_ops interface_vector_cpp_ops =
+{
+	.register_data_handle = register_vector_cpp_handle,
+	.allocate_data_on_node = allocate_vector_cpp_buffer_on_node,
+	.free_data_on_node = free_vector_cpp_buffer_on_node,
+	.init = NULL,
+	.copy_methods = &vector_cpp_copy_data_methods_s,
+	.handle_to_pointer = NULL,
+	.to_pointer = vector_cpp_to_pointer,
+	.pointer_is_inside = vector_cpp_pointer_is_inside,
+	.get_size = vector_cpp_interface_get_size,
+	.get_alloc_size = NULL,
+	.get_max_size = NULL,
+	.footprint = footprint_vector_cpp_interface_crc32,
+	.alloc_footprint = NULL,
+	.compare = vector_cpp_compare,
+	.alloc_compare = NULL,
+	.display = display_vector_cpp_interface,
+	.describe = vector_cpp_describe,
+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
+	.interface_size = sizeof(struct vector_cpp_interface),
+	.is_multiformat = 0,
+	.dontcache = 0,
+	.get_mf_ops = NULL,
+	.pack_data = pack_vector_cpp_handle,
+	.unpack_data = unpack_vector_cpp_handle,
+	.name = (char *) "VECTOR_CPP_INTERFACE"
+};
+#else
+/*
+ * Positional variant of the operations table, used when designated
+ * initializers are not available. The entries follow the field order spelled
+ * out by the designated variant above; each one is annotated with the field it
+ * initialises. The handle_to_pointer slot must be present (as NULL), otherwise
+ * every following entry is shifted by one field.
+ */
+static struct starpu_data_interface_ops interface_vector_cpp_ops =
+{
+	register_vector_cpp_handle,		/* register_data_handle */
+	allocate_vector_cpp_buffer_on_node,	/* allocate_data_on_node */
+	free_vector_cpp_buffer_on_node,		/* free_data_on_node */
+	NULL,					/* init */
+	&vector_cpp_copy_data_methods_s,	/* copy_methods */
+	NULL,					/* handle_to_pointer */
+	vector_cpp_to_pointer,			/* to_pointer */
+	vector_cpp_pointer_is_inside,		/* pointer_is_inside */
+	vector_cpp_interface_get_size,		/* get_size */
+	NULL,					/* get_alloc_size */
+	NULL,					/* get_max_size */
+	footprint_vector_cpp_interface_crc32,	/* footprint */
+	NULL,					/* alloc_footprint */
+	vector_cpp_compare,			/* compare */
+	NULL,					/* alloc_compare */
+	display_vector_cpp_interface,		/* display */
+	vector_cpp_describe,			/* describe */
+	STARPU_UNKNOWN_INTERFACE_ID,		/* interfaceid */
+	sizeof(struct vector_cpp_interface),	/* interface_size */
+	0,					/* is_multiformat */
+	0,					/* dontcache */
+	NULL,					/* get_mf_ops */
+	pack_vector_cpp_handle,			/* pack_data */
+	unpack_vector_cpp_handle,		/* unpack_data */
+	(char *) "VECTOR_CPP_INTERFACE"		/* name */
+};
+#endif
+
+/* Return the address recorded in the ptr field of the interface; node is ignored */
+static void *vector_cpp_to_pointer(void *data_interface, unsigned node)
+{
+	(void) node;
+
+	return (void*) ((struct vector_cpp_interface *) data_interface)->ptr;
+}
+
+/* Tell whether ptr falls within the nx*elemsize bytes starting at the
+ * interface's ptr; node is ignored */
+static int vector_cpp_pointer_is_inside(void *data_interface, unsigned int node, void *ptr)
+{
+	(void) node;
+	struct vector_cpp_interface *iface = (struct vector_cpp_interface *) data_interface;
+
+	char *candidate = (char*) ptr;
+	char *first = (char*) iface->ptr;
+	char *past_last = (char*) iface->ptr + iface->nx*iface->elemsize;
+
+	return candidate >= first && candidate < past_last;
+}
+
+/*
+ * Fill the per-node interfaces of a freshly registered handle: the home node
+ * receives the addresses and the vector given at registration, every other
+ * node starts with null addresses. The element count, element size, id and
+ * slice_base are copied to all nodes.
+ */
+static void register_vector_cpp_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
+{
+	struct vector_cpp_interface *registered = (struct vector_cpp_interface *) data_interface;
+
+	for (unsigned n = 0; n < STARPU_MAXNODES; n++)
+	{
+		struct vector_cpp_interface *on_node = (struct vector_cpp_interface *)
+			starpu_data_get_interface_on_node(handle, n);
+		const bool is_home = (n == home_node);
+
+		/* location information only makes sense on the home node */
+		on_node->ptr = is_home ? registered->ptr : 0;
+		on_node->dev_handle = is_home ? registered->dev_handle : 0;
+		on_node->offset = is_home ? registered->offset : 0;
+		on_node->vec = is_home ? registered->vec : NULL;
+
+		/* the description of the data is the same everywhere */
+		on_node->id = registered->id;
+		on_node->nx = registered->nx;
+		on_node->elemsize = registered->elemsize;
+		on_node->slice_base = registered->slice_base;
+	}
+}
+
+/* declare a new data with the vector interface */
+void vector_cpp_data_register(starpu_data_handle_t *handleptr, int home_node,
+                        std::vector<MY_TYPE>* vec, uint32_t nx, size_t elemsize)
+{
+	/* Describe the vector: both ptr and dev_handle designate its first
+	 * element, with a null offset. Fields are assigned one by one, which
+	 * works whatever the C++ standard in use. */
+	struct vector_cpp_interface vector;
+
+	vector.id = STARPU_UNKNOWN_INTERFACE_ID;
+	vector.ptr = (uintptr_t) &(*vec)[0];
+	vector.dev_handle = (uintptr_t) &(*vec)[0];
+	vector.offset = 0;
+	vector.nx = nx;
+	vector.elemsize = elemsize;
+	vector.vec = vec;
+	vector.slice_base = 0;
+
+	starpu_data_register(handleptr, home_node, &vector, &interface_vector_cpp_ops);
+}
+
+/* offer an access to the data parameters */
+uint32_t vector_cpp_get_nx(starpu_data_handle_t handle)
+{
+	/* the element count is read from the interface of the main RAM node */
+	void *main_ram_interface = starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+
+	return ((struct vector_cpp_interface *) main_ram_interface)->nx;
+}
+
+
+/* The footprint of a handle is a CRC of its number of elements */
+static uint32_t footprint_vector_cpp_interface_crc32(starpu_data_handle_t handle)
+{
+	const uint32_t nx = vector_cpp_get_nx(handle);
+
+	return starpu_hash_crc32c_be(nx, 0);
+}
+
+/* Return non-zero when both interfaces have the same number of elements and
+ * the same element size */
+static int vector_cpp_compare(void *data_interface_a, void *data_interface_b)
+{
+	struct vector_cpp_interface *lhs = (struct vector_cpp_interface *) data_interface_a;
+	struct vector_cpp_interface *rhs = (struct vector_cpp_interface *) data_interface_b;
+
+	if (lhs->nx != rhs->nx)
+		return 0;
+
+	return lhs->elemsize == rhs->elemsize;
+}
+
+/* Print the number of elements (as seen from the main RAM node) followed by a tab */
+static void display_vector_cpp_interface(starpu_data_handle_t handle, FILE *f)
+{
+	void *main_ram_interface = starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+	struct vector_cpp_interface *iface = (struct vector_cpp_interface *) main_ram_interface;
+
+	fprintf(f, "%u\t", iface->nx);
+}
+
+/*
+ * Store the size in bytes of the vector in *count and, when ptr is not NULL,
+ * allocate a buffer of that size in *ptr and copy the content of the vector
+ * found on the given node into it. Always returns 0.
+ */
+static int pack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct vector_cpp_interface *iface = (struct vector_cpp_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	*count = iface->nx*iface->elemsize;
+
+	/* the caller only asked for the size */
+	if (ptr == NULL)
+		return 0;
+
+	starpu_malloc_flags(ptr, *count, 0);
+	memcpy(*ptr, (void*)iface->ptr, iface->elemsize*iface->nx);
+
+	return 0;
+}
+
+/*
+ * Copy count bytes from ptr into the vector found on the given node; count
+ * has to be exactly the size in bytes of the vector. Always returns 0.
+ */
+static int unpack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct vector_cpp_interface *iface = (struct vector_cpp_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+	void *destination = (void*)iface->ptr;
+
+	STARPU_ASSERT(count == iface->elemsize * iface->nx);
+	memcpy(destination, ptr, count);
+
+	return 0;
+}
+
+/* Size in bytes of the data: number of elements times element size, both
+ * read from the interface of the main RAM node */
+static size_t vector_cpp_interface_get_size(starpu_data_handle_t handle)
+{
+	struct vector_cpp_interface *iface = (struct vector_cpp_interface *)
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+
+	return iface->nx*iface->elemsize;
+}
+
+/* Size in bytes of one element, read from the interface of the main RAM node */
+size_t vector_cpp_get_elemsize(starpu_data_handle_t handle)
+{
+	void *main_ram_interface = starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+
+	return ((struct vector_cpp_interface *) main_ram_interface)->elemsize;
+}
+
+/* memory allocation/deallocation primitives for the vector interface */
+
+/* returns the size of the allocated area */
+/*
+ * Create, on node dst_node, a vector of nx elements set to 0 and record it in
+ * the interface. Returns nx*elemsize on success, -ENOMEM when the allocation
+ * fails.
+ */
+static starpu_ssize_t allocate_vector_cpp_buffer_on_node(void *data_interface_, unsigned dst_node)
+{
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *) data_interface_;
+
+	uint32_t nx = vector_interface->nx;
+	size_t elemsize = vector_interface->elemsize;
+
+	starpu_ssize_t allocated_memory;
+	std::vector<MY_TYPE> * vec;
+
+	/* operator new reports failure with an exception, never with a null
+	 * pointer: catch it here so that it is turned into an error code
+	 * instead of propagating into the caller of this callback */
+	try
+	{
+		const my_allocator<char> allocator(dst_node);
+		vec = new std::vector<MY_TYPE>(nx, 0, allocator);
+	}
+	catch (const std::bad_alloc &)
+	{
+		return -ENOMEM;
+	}
+
+	vector_interface->vec = vec;
+
+	allocated_memory = nx*elemsize;
+
+	/* update the data properly in consequence */
+	vector_interface->ptr = (uintptr_t) &((*vec)[0]);
+	vector_interface->dev_handle = (uintptr_t) &((*vec)[0]);
+	vector_interface->offset = 0;
+
+	return allocated_memory;
+}
+
+/* Destroy the vector object recorded in the interface; node is not used */
+static void free_vector_cpp_buffer_on_node(void *data_interface, unsigned node)
+{
+	(void) node;
+
+	delete ((struct vector_cpp_interface *) data_interface)->vec;
+}
+
+/*
+ * Transfer the whole content of the source vector (nx*elemsize bytes of the
+ * source interface) to the destination through starpu_interface_copy(), and
+ * return what that call returns.
+ */
+static int vector_interface_copy_any_to_any(void *src_interface, unsigned src_node,
+                           void *dst_interface, unsigned dst_node, void *async_data)
+{
+	struct vector_cpp_interface *from = (struct vector_cpp_interface *) src_interface;
+	struct vector_cpp_interface *to = (struct vector_cpp_interface *) dst_interface;
+
+	return starpu_interface_copy(from->dev_handle, from->offset, src_node,
+				     to->dev_handle, to->offset, dst_node,
+				     from->nx*from->elemsize, async_data);
+}
+
+/* Write "V<nx>x<elemsize>" into buf (at most size bytes) and return the
+ * value returned by snprintf() */
+static starpu_ssize_t vector_cpp_describe(void *data_interface, char *buf, size_t size)
+{
+	struct vector_cpp_interface *iface = (struct vector_cpp_interface *) data_interface;
+	const unsigned nx = (unsigned) iface->nx;
+	const unsigned elemsize = (unsigned) iface->elemsize;
+
+	return snprintf(buf, size, "V%ux%u", nx, elemsize);
+}
+
+/*
+ * End of interface
+ */
+
+
+
+/* Kernel using STL objects */
+
+/* CPU implementation of the codelet: C = A + B, element by element.
+ * buffers[0..2] are the interfaces of A, B and C; cl_arg is not used. */
+void cpu_kernel_add_vectors(void *buffers[], void *cl_arg)
+{
+	std::vector<MY_TYPE> &a = *VECTOR_CPP_GET_VEC(buffers[0]);
+	std::vector<MY_TYPE> &b = *VECTOR_CPP_GET_VEC(buffers[1]);
+	std::vector<MY_TYPE> &c = *VECTOR_CPP_GET_VEC(buffers[2]);
+
+	// the three vectors must have the same number of elements
+	assert(a.size() == b.size() && b.size() == c.size());
+
+	// c[] = a[] + b[]
+	size_t idx = 0;
+	while (idx < c.size())
+	{
+		c[idx] = a[idx] + b[idx];
+		idx++;
+	}
+}
+
+#define VEC_SIZE 1024
+
+/*
+ * Register three C++ vectors (A and B on the main RAM node, C on the last
+ * NUMA node), submit one task computing C = A + B and check that every element
+ * of C equals 5. Returns 77 when StarPU reports -ENODEV, EXIT_SUCCESS or
+ * EXIT_FAILURE otherwise.
+ */
+int main(int argc, char **argv)
+{
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	/* default configuration, except for nmic and nmpi_ms which are forced to 0 */
+	conf.nmic = 0;
+	conf.nmpi_ms = 0;
+
+	// initialize StarPU with this configuration
+	int ret = starpu_init(&conf);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Test data transfers between NUMA nodes if available */
+	unsigned last_numa_node = starpu_memory_nodes_get_numa_count() - 1;
+
+	const my_allocator<char> allocator_main_ram(STARPU_MAIN_RAM);
+	const my_allocator<char> allocator_last_numa(last_numa_node);
+	std::vector<MY_TYPE> vec_A(VEC_SIZE, 2, allocator_main_ram); // all the vector is initialized to 2
+	std::vector<MY_TYPE> vec_B(VEC_SIZE, 3, allocator_main_ram); // all the vector is initialized to 3
+	std::vector<MY_TYPE> vec_C(VEC_SIZE, 0, allocator_last_numa); // all the vector is initialized to 0
+
+	// StarPU data registering
+	starpu_data_handle_t spu_vec_A;
+	starpu_data_handle_t spu_vec_B;
+	starpu_data_handle_t spu_vec_C;
+
+	// give the vectors to StarPU through the C++ vector interface
+	vector_cpp_data_register(&spu_vec_A, STARPU_MAIN_RAM, &vec_A, vec_A.size(), sizeof(char));
+	vector_cpp_data_register(&spu_vec_B, STARPU_MAIN_RAM, &vec_B, vec_B.size(), sizeof(char));
+	vector_cpp_data_register(&spu_vec_C, last_numa_node, &vec_C, vec_C.size(), sizeof(char));
+
+	// create the StarPU codelet
+	starpu_codelet cl;
+	starpu_codelet_init(&cl);
+	cl.cpu_funcs     [0] = cpu_kernel_add_vectors;
+	cl.cpu_funcs_name[0] = "cpu_kernel_add_vectors";
+	cl.nbuffers          = 3;
+	cl.modes         [0] = STARPU_R;
+	cl.modes         [1] = STARPU_R;
+	cl.modes         [2] = STARPU_W;
+	cl.name              = "add_vectors";
+
+	// submit a new StarPU task to execute
+	ret = starpu_task_insert(&cl,
+	                         STARPU_R, spu_vec_A,
+	                         STARPU_R, spu_vec_B,
+	                         STARPU_W, spu_vec_C,
+	                         0);
+	if (ret == -ENODEV)
+	{
+		// StarPU data unregistering
+		starpu_data_unregister(spu_vec_C);
+		starpu_data_unregister(spu_vec_B);
+		starpu_data_unregister(spu_vec_A);
+
+		// terminate StarPU, no task can be submitted after
+		starpu_shutdown();
+
+		return 77;
+	}
+
+	STARPU_CHECK_RETURN_VALUE(ret, "task_submit::add_vectors");
+
+	// wait the task
+	starpu_task_wait_for_all();
+
+	// StarPU data unregistering
+	starpu_data_unregister(spu_vec_C);
+	starpu_data_unregister(spu_vec_B);
+	starpu_data_unregister(spu_vec_A);
+
+	// terminate StarPU, no task can be submitted after
+	// NOTE(review): vec_A, vec_B and vec_C are destroyed when main returns,
+	// i.e. their allocator calls starpu_free_on_node() after this shutdown --
+	// confirm that this is supported.
+	starpu_shutdown();
+
+	// check results: every element of C has to be 2 + 3
+	bool fail = false;
+	int i = 0;
+	while (!fail && i < VEC_SIZE)
+		fail = vec_C[i++] != 5;
+
+	if (fail)
+	{
+#ifdef PRINT_OUTPUT
+		std::cout << "Example failed..." << std::endl;
+#endif
+		return EXIT_FAILURE;
+	}
+	else
+	{
+#ifdef PRINT_OUTPUT
+		std::cout << "Example successfully passed!" << std::endl;
+#endif
+		return EXIT_SUCCESS;
+	}
+}
+#endif

+ 1 - 1
examples/dependency/sequential_consistency.c

@@ -69,7 +69,7 @@ void cpu_codeletA(void *descr[], void *args)
 
 	ret = starpu_task_insert(&clB,
 				 STARPU_RW, value_handle,
-				 STARPU_CALLBACK_WITH_ARG, starpu_tag_notify_from_apps, tagHoldC,
+				 STARPU_CALLBACK_WITH_ARG_NFREE, starpu_tag_notify_from_apps, tagHoldC,
 				 STARPU_HANDLES_SEQUENTIAL_CONSISTENCY, handle_sequential_consistency,
 				 STARPU_NAME, "taskB",
 				 0);

+ 1 - 1
examples/dependency/task_end_dep_add.c

@@ -47,7 +47,7 @@ void cpu_codelet(void *descr[], void *args)
 	starpu_task_end_dep_add(task, 1);
 
 	starpu_task_insert(&cl2,
-			   STARPU_CALLBACK_WITH_ARG, starpu_task_end_dep_release, task,
+			   STARPU_CALLBACK_WITH_ARG_NFREE, starpu_task_end_dep_release, task,
 			   0);
 	STARPU_ASSERT(*val == INIT);
 	*val *= 2;

+ 131 - 0
examples/perf_monitoring/perf_counters_01.c

@@ -0,0 +1,131 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <assert.h>
+#include <string.h>
+
+/* Print the performance counters registered in SCOPE: first the scope name,
+ * then one line per counter giving its rank, name, id, type name and help
+ * string. */
+static void print_scope(const enum starpu_perf_counter_scope scope)
+{
+	const int count = starpu_perf_counter_nb(scope);
+	int rank = 0;
+
+	printf("scope %s\n", starpu_perf_counter_scope_id_to_name(scope));
+	while (rank < count)
+	{
+		const int id = starpu_perf_counter_nth_to_id(scope, rank);
+		const char *counter_name = starpu_perf_counter_id_to_name(id);
+		const char *help_text = starpu_perf_counter_get_help_string(id);
+		const int type_id = starpu_perf_counter_get_type_id(id);
+		const char *type_name = starpu_perf_counter_type_id_to_name(type_id);
+
+		/* ranks are displayed 1-based */
+		rank++;
+		printf("%d/%d - %s (0x%08x): [%s] / %s\n", rank, count, counter_name, id, type_name, help_text);
+	}
+}
+
+/* Check that performance counter scope and type names map to the expected
+ * ids and back, then display the available counters three ways: through
+ * print_scope(), through starpu_perf_counter_list_avail() for each scope, and
+ * through starpu_perf_counter_list_all_avail().
+ * Returns 77 when starpu_init() reports -ENODEV, 0 otherwise. */
+int main(int argc, char **argv)
+{
+	struct name_id
+	{
+		const char *name;
+		int id;
+	};
+	/* expected name <-> id associations, listed in the order they are checked */
+	const struct name_id scopes[] =
+	{
+		{ "global",      starpu_perf_counter_scope_global },
+		{ "per_worker",  starpu_perf_counter_scope_per_worker },
+		{ "per_codelet", starpu_perf_counter_scope_per_codelet }
+	};
+	const struct name_id types[] =
+	{
+		{ "int32",  starpu_perf_counter_type_int32 },
+		{ "int64",  starpu_perf_counter_type_int64 },
+		{ "float",  starpu_perf_counter_type_float },
+		{ "double", starpu_perf_counter_type_double }
+	};
+	const unsigned nscopes = sizeof(scopes) / sizeof(scopes[0]);
+	const unsigned ntypes = sizeof(types) / sizeof(types[0]);
+	unsigned k;
+
+	int status = starpu_init(NULL);
+	if (status == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(status, "starpu_init");
+
+	/* scope name -> scope id */
+	for (k = 0; k < nscopes; k++)
+	{
+		const int id = starpu_perf_counter_scope_name_to_id(scopes[k].name);
+		STARPU_ASSERT(id == scopes[k].id);
+		(void)id;
+	}
+
+	/* scope id -> scope name */
+	for (k = 0; k < nscopes; k++)
+	{
+		const char *name = starpu_perf_counter_scope_id_to_name((enum starpu_perf_counter_scope)scopes[k].id);
+		STARPU_ASSERT(strcmp(name, scopes[k].name) == 0);
+		(void)name;
+	}
+
+	/* type name -> type id */
+	for (k = 0; k < ntypes; k++)
+	{
+		const int id = starpu_perf_counter_type_name_to_id(types[k].name);
+		STARPU_ASSERT(id == types[k].id);
+		(void)id;
+	}
+
+	/* type id -> type name */
+	for (k = 0; k < ntypes; k++)
+	{
+		const char *name = starpu_perf_counter_type_id_to_name(types[k].id);
+		STARPU_ASSERT(strcmp(name, types[k].name) == 0);
+		(void)name;
+	}
+
+	printf("programmatically get counters per scope\n");
+	for (k = 0; k < nscopes; k++)
+	{
+		print_scope((enum starpu_perf_counter_scope)scopes[k].id);
+	}
+	printf("\n");
+
+	printf("list available counters per scope\n");
+	for (k = 0; k < nscopes; k++)
+	{
+		starpu_perf_counter_list_avail((enum starpu_perf_counter_scope)scopes[k].id);
+	}
+	printf("\n");
+
+	printf("list all available counters\n");
+	starpu_perf_counter_list_all_avail();
+	printf("\n");
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 241 - 0
examples/perf_monitoring/perf_counters_02.c

@@ -0,0 +1,241 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <assert.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+/* global counters */
+static int id_g_total_submitted;
+static int id_g_peak_submitted;
+static int id_g_peak_ready;
+
+/* per worker counters */
+static int id_w_total_executed;
+static int id_w_cumul_execution_time;
+
+/* per_codelet counters */
+static int id_c_total_submitted;
+static int id_c_peak_submitted;
+static int id_c_peak_ready;
+static int id_c_total_executed;
+static int id_c_cumul_execution_time;
+
+/* Global-scope listener callback: read the three global task counters from
+ * SAMPLE and print them. LISTENER and CONTEXT are not used. */
+void g_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context)
+{
+	(void) listener;
+	(void) context;
+	int64_t g_total_submitted = starpu_perf_counter_sample_get_int64_value(sample, id_g_total_submitted);
+	int64_t g_peak_submitted = starpu_perf_counter_sample_get_int64_value(sample, id_g_peak_submitted);
+	int64_t g_peak_ready = starpu_perf_counter_sample_get_int64_value(sample, id_g_peak_ready);
+	/* int64_t is not 'long' on every platform: convert explicitly so that the
+	 * arguments always match the %lld conversions */
+	printf("global: g_total_submitted = %lld, g_peak_submitted = %lld, g_peak_ready = %lld\n", (long long)g_total_submitted, (long long)g_peak_submitted, (long long)g_peak_ready);
+}
+
+/* Per-worker listener callback: print the number of executed tasks and the
+ * cumulated execution time found in SAMPLE, tagged with the id returned by
+ * starpu_worker_get_id(). LISTENER and CONTEXT are not used. */
+void w_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context)
+{
+	(void) listener;
+	(void) context;
+	int workerid = starpu_worker_get_id();
+	int64_t w_total_executed = starpu_perf_counter_sample_get_int64_value(sample, id_w_total_executed);
+	double w_cumul_execution_time = starpu_perf_counter_sample_get_double_value(sample, id_w_cumul_execution_time);
+
+	/* int64_t is not 'long' on every platform: convert explicitly so that the
+	 * argument always matches the %lld conversion */
+	printf("worker[%d]: w_total_executed = %lld, w_cumul_execution_time = %lf\n", workerid, (long long)w_total_executed, w_cumul_execution_time);
+}
+
+/* Per-codelet listener callback: print the five per-codelet counters found in
+ * SAMPLE. The codelet is identified by its name when it has one, and by its
+ * address otherwise. LISTENER is not used.
+ * NOTE(review): CONTEXT is dereferenced as a struct starpu_codelet *, whereas
+ * main() creates the listener with an integer cookie; this presumably relies
+ * on the runtime passing the monitored codelet here -- confirm. */
+void c_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context)
+{
+	(void) listener;
+	struct starpu_codelet *cl = context;
+	int64_t c_total_submitted = starpu_perf_counter_sample_get_int64_value(sample, id_c_total_submitted);
+	int64_t c_peak_submitted = starpu_perf_counter_sample_get_int64_value(sample, id_c_peak_submitted);
+	int64_t c_peak_ready = starpu_perf_counter_sample_get_int64_value(sample, id_c_peak_ready);
+	int64_t c_total_executed = starpu_perf_counter_sample_get_int64_value(sample, id_c_total_executed);
+	double c_cumul_execution_time = starpu_perf_counter_sample_get_double_value(sample, id_c_cumul_execution_time);
+	/* %s must only ever receive a non-NULL name; unnamed codelets fall back
+	 * to their address. The int64_t values are converted to long long so
+	 * that they always match the %lld conversions. */
+	if (cl->name != NULL)
+	{
+		printf("codelet[%s]: c_total_submitted = %lld, c_peak_submitted = %lld, c_peak_ready = %lld, c_total_executed = %lld, c_cumul_execution_time = %lf\n", cl->name, (long long)c_total_submitted, (long long)c_peak_submitted, (long long)c_peak_ready, (long long)c_total_executed, c_cumul_execution_time);
+	}
+	else
+	{
+		printf("codelet[%p]: c_total_submitted = %lld, c_peak_submitted = %lld, c_peak_ready = %lld, c_total_executed = %lld, c_cumul_execution_time = %lf\n", (void *)cl, (long long)c_total_submitted, (long long)c_peak_submitted, (long long)c_peak_ready, (long long)c_total_executed, c_cumul_execution_time);
+	}
+}
+
+/* CPU implementation of the codelet: unpack the iteration count from CL_ARGS,
+ * then for each iteration index i add i to element (i % NX) of the vector
+ * held in buffers[0], NX being the number of elements of that vector. */
+void f(void *buffers[], void *cl_args)
+{
+	int *int_vector = (int*)STARPU_VECTOR_GET_PTR(buffers[0]);
+	int NX = (int)STARPU_VECTOR_GET_NX(buffers[0]);
+	/* must not be const: starpu_codelet_unpack_args() stores the unpacked
+	 * value through the pointer it is given */
+	int niters;
+	starpu_codelet_unpack_args(cl_args, &niters);
+	int i;
+	for (i=0; i<niters; i++)
+	{
+		int_vector[i % NX] += i;
+	}
+}
+
+/* Codelet submitted by main(): only a CPU implementation (f) is provided, and
+ * it works on a single data buffer. */
+struct starpu_codelet cl =
+{
+	.cpu_funcs      = {f},
+	.cpu_funcs_name = {"f"},
+	.nbuffers       = 1,
+	.name           = "perf_counter_f"
+};
+
+const enum starpu_perf_counter_scope g_scope = starpu_perf_counter_scope_global;
+const enum starpu_perf_counter_scope w_scope = starpu_perf_counter_scope_per_worker;
+const enum starpu_perf_counter_scope c_scope = starpu_perf_counter_scope_per_codelet;
+
+#define NVECTORS 5
+#define NTASKS 1000
+#define NITER 1000
+#define VECTOR_LEN 2
+
+/* Monitor task execution with performance counters: allocate one counter set
+ * per scope (global, per-worker, per-codelet), enable the counters of
+ * interest, attach listeners, submit NTASKS tasks working on NVECTORS small
+ * vectors, then detach and release everything in reverse order.
+ * Returns 77 when StarPU cannot be initialized or a task cannot be submitted
+ * because of -ENODEV, 0 otherwise. */
+int main(int argc, char **argv)
+{
+	int ret;
+	int exit_code = 0;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* one counter set per scope */
+	struct starpu_perf_counter_set *g_set = starpu_perf_counter_set_alloc(g_scope);
+	STARPU_ASSERT(g_set != NULL);
+	struct starpu_perf_counter_set *w_set = starpu_perf_counter_set_alloc(w_scope);
+	STARPU_ASSERT(w_set != NULL);
+	struct starpu_perf_counter_set *c_set = starpu_perf_counter_set_alloc(c_scope);
+	STARPU_ASSERT(c_set != NULL);
+
+	/* resolve counter names into the ids later used by the listener callbacks */
+	id_g_total_submitted = starpu_perf_counter_name_to_id(g_scope, "starpu.task.g_total_submitted");
+	STARPU_ASSERT(id_g_total_submitted != -1);
+	id_g_peak_submitted = starpu_perf_counter_name_to_id(g_scope, "starpu.task.g_peak_submitted");
+	STARPU_ASSERT(id_g_peak_submitted != -1);
+	id_g_peak_ready = starpu_perf_counter_name_to_id(g_scope, "starpu.task.g_peak_ready");
+	STARPU_ASSERT(id_g_peak_ready != -1);
+
+
+	id_w_total_executed = starpu_perf_counter_name_to_id(w_scope, "starpu.task.w_total_executed");
+	STARPU_ASSERT(id_w_total_executed != -1);
+	id_w_cumul_execution_time = starpu_perf_counter_name_to_id(w_scope, "starpu.task.w_cumul_execution_time");
+	STARPU_ASSERT(id_w_cumul_execution_time != -1);
+
+	id_c_total_submitted = starpu_perf_counter_name_to_id(c_scope, "starpu.task.c_total_submitted");
+	STARPU_ASSERT(id_c_total_submitted != -1);
+	id_c_peak_submitted = starpu_perf_counter_name_to_id(c_scope, "starpu.task.c_peak_submitted");
+	STARPU_ASSERT(id_c_peak_submitted != -1);
+	id_c_peak_ready = starpu_perf_counter_name_to_id(c_scope, "starpu.task.c_peak_ready");
+	STARPU_ASSERT(id_c_peak_ready != -1);
+	id_c_total_executed = starpu_perf_counter_name_to_id(c_scope, "starpu.task.c_total_executed");
+	STARPU_ASSERT(id_c_total_executed != -1);
+	id_c_cumul_execution_time = starpu_perf_counter_name_to_id(c_scope, "starpu.task.c_cumul_execution_time");
+	STARPU_ASSERT(id_c_cumul_execution_time != -1);
+
+	/* enable the selected counters in their respective sets */
+	starpu_perf_counter_set_enable_id(g_set, id_g_total_submitted);
+	starpu_perf_counter_set_enable_id(g_set, id_g_peak_submitted);
+	starpu_perf_counter_set_enable_id(g_set, id_g_peak_ready);
+
+	starpu_perf_counter_set_enable_id(w_set, id_w_total_executed);
+	starpu_perf_counter_set_enable_id(w_set, id_w_cumul_execution_time);
+
+	starpu_perf_counter_set_enable_id(c_set, id_c_total_submitted);
+	starpu_perf_counter_set_enable_id(c_set, id_c_peak_submitted);
+	starpu_perf_counter_set_enable_id(c_set, id_c_peak_ready);
+	starpu_perf_counter_set_enable_id(c_set, id_c_total_executed);
+	starpu_perf_counter_set_enable_id(c_set, id_c_cumul_execution_time);
+
+	/* create one listener per set and attach it to its scope */
+	struct starpu_perf_counter_listener * g_listener = starpu_perf_counter_listener_init(g_set, g_listener_cb, (void *)(uintptr_t)42);
+	struct starpu_perf_counter_listener * w_listener = starpu_perf_counter_listener_init(w_set, w_listener_cb, (void *)(uintptr_t)17);
+	struct starpu_perf_counter_listener * c_listener = starpu_perf_counter_listener_init(c_set, c_listener_cb, (void *)(uintptr_t)76);
+
+	starpu_perf_counter_set_global_listener(g_listener);
+	starpu_perf_counter_set_all_per_worker_listeners(w_listener);
+
+	starpu_perf_counter_set_per_codelet_listener(&cl, c_listener);
+
+	/* allocate, fill and register the vectors the tasks work on */
+	int* vector[NVECTORS];
+	starpu_data_handle_t vector_h[NVECTORS];
+	int v;
+	for (v=0; v<NVECTORS; v++)
+	{
+		vector[v] = calloc(VECTOR_LEN, sizeof(*(vector[v])));
+		STARPU_ASSERT(vector[v] != NULL);
+
+		{
+			int i;
+			for (i=0; i<VECTOR_LEN; i++)
+			{
+				vector[v][i] = i;
+			}
+		}
+
+		starpu_vector_data_register(&vector_h[v], STARPU_MAIN_RAM, (uintptr_t)vector[v], VECTOR_LEN, sizeof(*vector[v]));
+	}
+
+	/* submit the tasks, cycling over the vectors */
+	{
+		int i;
+		for (i=0; i<NTASKS; i++)
+		{
+			v = i % NVECTORS;
+			const int niter = NITER;
+			ret = starpu_insert_task(&cl,
+					STARPU_RW, vector_h[v],
+					STARPU_VALUE, &niter, sizeof(int),
+					0);
+			if (ret == -ENODEV)
+			{
+				/* the task cannot be submitted: stop here, go
+				 * through the regular cleanup below and report
+				 * the example as skipped */
+				exit_code = 77;
+				break;
+			}
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		}
+	}
+
+	for (v=0; v<NVECTORS; v++)
+	{
+		starpu_data_unregister(vector_h[v]);
+		free(vector[v]);
+	}
+
+	/* tear down in reverse order: listeners, enabled counters, then sets */
+	starpu_perf_counter_unset_per_codelet_listener(&cl);
+	starpu_perf_counter_unset_all_per_worker_listeners();
+	starpu_perf_counter_unset_global_listener();
+
+	starpu_perf_counter_listener_exit(c_listener);
+	starpu_perf_counter_listener_exit(w_listener);
+	starpu_perf_counter_listener_exit(g_listener);
+
+	starpu_perf_counter_set_disable_id(c_set, id_c_cumul_execution_time);
+	starpu_perf_counter_set_disable_id(c_set, id_c_total_executed);
+	starpu_perf_counter_set_disable_id(c_set, id_c_peak_ready);
+	starpu_perf_counter_set_disable_id(c_set, id_c_peak_submitted);
+	starpu_perf_counter_set_disable_id(c_set, id_c_total_submitted);
+
+	starpu_perf_counter_set_disable_id(w_set, id_w_cumul_execution_time);
+	starpu_perf_counter_set_disable_id(w_set, id_w_total_executed);
+
+	starpu_perf_counter_set_disable_id(g_set, id_g_peak_ready);
+	starpu_perf_counter_set_disable_id(g_set, id_g_peak_submitted);
+	starpu_perf_counter_set_disable_id(g_set, id_g_total_submitted);
+
+	starpu_perf_counter_set_free(c_set);
+	c_set = NULL;
+
+	starpu_perf_counter_set_free(w_set);
+	w_set = NULL;
+
+	starpu_perf_counter_set_free(g_set);
+	g_set = NULL;
+
+	starpu_shutdown();
+
+	return exit_code;
+}

+ 131 - 0
examples/perf_steering/perf_knobs_01.c

@@ -0,0 +1,131 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <assert.h>
+#include <string.h>
+
+/* Print the performance knobs registered in SCOPE: first the scope name, then
+ * one line per knob giving its rank, name, id, type name and help string. */
+static void print_scope(const enum starpu_perf_knob_scope scope)
+{
+	const int count = starpu_perf_knob_nb(scope);
+	int rank = 0;
+
+	printf("scope %s\n", starpu_perf_knob_scope_id_to_name(scope));
+	while (rank < count)
+	{
+		const int id = starpu_perf_knob_nth_to_id(scope, rank);
+		const char *knob_name = starpu_perf_knob_id_to_name(id);
+		const char *help_text = starpu_perf_knob_get_help_string(id);
+		const int type_id = starpu_perf_knob_get_type_id(id);
+		const char *type_name = starpu_perf_knob_type_id_to_name(type_id);
+
+		/* ranks are displayed 1-based */
+		rank++;
+		printf("%d/%d - %s (0x%08x): [%s] / %s\n", rank, count, knob_name, id, type_name, help_text);
+	}
+}
+
+/* Check that performance knob scope and type names map to the expected ids
+ * and back, then display the available knobs three ways: through
+ * print_scope(), through starpu_perf_knob_list_avail() for each scope, and
+ * through starpu_perf_knob_list_all_avail().
+ * Returns 77 when starpu_init() reports -ENODEV, 0 otherwise. */
+int main(int argc, char **argv)
+{
+	struct name_id
+	{
+		const char *name;
+		int id;
+	};
+	/* expected name <-> id associations, listed in the order they are checked */
+	const struct name_id scopes[] =
+	{
+		{ "global",        starpu_perf_knob_scope_global },
+		{ "per_worker",    starpu_perf_knob_scope_per_worker },
+		{ "per_scheduler", starpu_perf_knob_scope_per_scheduler }
+	};
+	const struct name_id types[] =
+	{
+		{ "int32",  starpu_perf_knob_type_int32 },
+		{ "int64",  starpu_perf_knob_type_int64 },
+		{ "float",  starpu_perf_knob_type_float },
+		{ "double", starpu_perf_knob_type_double }
+	};
+	const unsigned nscopes = sizeof(scopes) / sizeof(scopes[0]);
+	const unsigned ntypes = sizeof(types) / sizeof(types[0]);
+	unsigned k;
+
+	int status = starpu_init(NULL);
+	if (status == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(status, "starpu_init");
+
+	/* scope name -> scope id */
+	for (k = 0; k < nscopes; k++)
+	{
+		const int id = starpu_perf_knob_scope_name_to_id(scopes[k].name);
+		STARPU_ASSERT(id == scopes[k].id);
+		(void)id;
+	}
+
+	/* scope id -> scope name */
+	for (k = 0; k < nscopes; k++)
+	{
+		const char *name = starpu_perf_knob_scope_id_to_name((enum starpu_perf_knob_scope)scopes[k].id);
+		STARPU_ASSERT(strcmp(name, scopes[k].name) == 0);
+		(void)name;
+	}
+
+	/* type name -> type id */
+	for (k = 0; k < ntypes; k++)
+	{
+		const int id = starpu_perf_knob_type_name_to_id(types[k].name);
+		STARPU_ASSERT(id == types[k].id);
+		(void)id;
+	}
+
+	/* type id -> type name */
+	for (k = 0; k < ntypes; k++)
+	{
+		const char *name = starpu_perf_knob_type_id_to_name(types[k].id);
+		STARPU_ASSERT(strcmp(name, types[k].name) == 0);
+		(void)name;
+	}
+
+	printf("programmatically get knobs per scope\n");
+	for (k = 0; k < nscopes; k++)
+	{
+		print_scope((enum starpu_perf_knob_scope)scopes[k].id);
+	}
+	printf("\n");
+
+	printf("list available knobs per scope\n");
+	for (k = 0; k < nscopes; k++)
+	{
+		starpu_perf_knob_list_avail((enum starpu_perf_knob_scope)scopes[k].id);
+	}
+	printf("\n");
+
+	printf("list all available knobs\n");
+	starpu_perf_knob_list_all_avail();
+	printf("\n");
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 149 - 0
examples/perf_steering/perf_knobs_02.c

@@ -0,0 +1,149 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <assert.h>
+#include <string.h>
+
+/* Look up the knob KNOB_NAME in the scope named SCOPE_NAME, assert that its
+ * type is the one named TYPE_NAME, and return its id. */
+static int lookup_knob(const char *knob_name, const char *scope_name, const char *type_name)
+{
+	const int scope_id = starpu_perf_knob_scope_name_to_id(scope_name);
+	const int id = starpu_perf_knob_name_to_id(scope_id, knob_name);
+	STARPU_ASSERT(starpu_perf_knob_get_type_id(id) == starpu_perf_knob_type_name_to_id(type_name));
+
+	return id;
+}
+
+/* Print the current value of the global int32 knob KNOB_NAME, set it to 1 and
+ * then to 0 (printing and checking each value read back), and finally restore
+ * the value it had on entry. */
+static void toggle_global_int32_knob(const char *knob_name)
+{
+	const int id = lookup_knob(knob_name, "global", "int32");
+	int32_t val, val_save;
+
+	printf("%s:\n", knob_name);
+
+	val_save = val = starpu_perf_knob_get_global_int32_value(id);
+	printf("- %d\n", val);
+
+	starpu_perf_knob_set_global_int32_value(id, 1);
+	val = starpu_perf_knob_get_global_int32_value(id);
+	printf("- %d\n", val);
+	STARPU_ASSERT(val == 1);
+
+	starpu_perf_knob_set_global_int32_value(id, 0);
+	val = starpu_perf_knob_get_global_int32_value(id);
+	printf("- %d\n", val);
+	STARPU_ASSERT(val == 0);
+
+	starpu_perf_knob_set_global_int32_value(id, val_save);
+	val = starpu_perf_knob_get_global_int32_value(id);
+	printf("- %d\n", val);
+	STARPU_ASSERT(val == val_save);
+}
+
+/* Print the value of the per-scheduler int32 knob KNOB_NAME as read for the
+ * scheduling policy named "prio". */
+static void print_prio_sched_int32_knob(const char *knob_name)
+{
+	const int id = lookup_knob(knob_name, "per_scheduler", "int32");
+	int32_t val;
+
+	printf("%s:\n", knob_name);
+	val = starpu_perf_knob_get_per_scheduler_int32_value(id, "prio");
+	printf("- %d\n", val);
+}
+
+/* Read and write a few performance knobs: toggle two global knobs, print the
+ * per-worker binding knob of every CPU worker, and print the min/max priority
+ * cap knobs of the "prio" scheduler.
+ * Returns 77 when starpu_init() reports -ENODEV, 0 otherwise. */
+int main(int argc, char **argv)
+{
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	toggle_global_int32_knob("starpu.global.g_calibrate_knob");
+	toggle_global_int32_knob("starpu.global.g_enable_catch_signal_knob");
+
+	{
+		const char * const knob_name = "starpu.worker.w_bind_to_pu_knob";
+		const int id = lookup_knob(knob_name, "per_worker", "int32");
+		int32_t val;
+
+		printf("%s:\n", knob_name);
+
+		unsigned int ncpu  = starpu_cpu_worker_get_count();
+		unsigned int i;
+		for (i=0; i<ncpu; i++)
+		{
+			val = starpu_perf_knob_get_per_worker_int32_value(id, i);
+			STARPU_ASSERT(val >= 0);
+			printf("- %u: %d\n", i, val);
+		}
+	}
+
+	print_prio_sched_int32_knob("starpu.task.s_max_priority_cap_knob");
+	print_prio_sched_int32_knob("starpu.task.s_min_priority_cap_knob");
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 180 - 0
examples/perf_steering/perf_knobs_03.c

@@ -0,0 +1,180 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <assert.h>
+#include <string.h>
+
+#define NTASKS 100
+
+volatile int task_count[2];
+
+/* Task body: increment the task_count slot of the worker running the task.
+ * Only worker ids 0 and 1 are expected. Both arguments are unused. */
+void cpu_func(void *buffer[], void *cl_arg)
+{
+	const int me = starpu_worker_get_id();
+
+	(void)cl_arg;
+	(void)buffer;
+	STARPU_ASSERT(me == 0 || me == 1);
+	task_count[me] += 1;
+}
+
+/* Steer the runtime through performance knobs, using two CPU workers and the
+ * 'prio' scheduling policy: first force both priority cap knobs of the
+ * scheduler to 0, then disable each worker in turn through its enable knob
+ * and check that all NTASKS submitted tasks ran on the other worker.
+ * Returns 77 (skipped) when another scheduler is requested through
+ * STARPU_SCHED, when StarPU reports -ENODEV, or when the set of workers is
+ * not exactly two CPU workers; returns 0 otherwise. */
+int main(int argc, char **argv)
+{
+	int ret;
+
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.ncpus = 2;
+	conf.ncuda = 0;
+	conf.nopencl = 0;
+	conf.nmic = 0;
+	conf.nmpi_ms = 0;
+	{
+		const char *sched_pol_name = starpu_getenv("STARPU_SCHED");
+		if (sched_pol_name != NULL && strcmp(sched_pol_name, "prio") != 0)
+		{
+			fprintf(stderr, "example uses 'prio' scheduling policy.\n");
+			return 77;
+		}
+	}
+
+	conf.sched_policy_name = "prio";
+
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	if (starpu_cpu_worker_get_count() != 2
+		|| starpu_cuda_worker_get_count() != 0
+		|| starpu_opencl_worker_get_count() != 0
+		|| starpu_mic_worker_get_count() != 0
+		|| starpu_mpi_ms_worker_get_count() != 0)
+	{
+		fprintf(stderr, "example needs exactly two cpu cores.\n");
+		/* the runtime was successfully initialized above: shut it
+		 * down before skipping */
+		starpu_shutdown();
+		return 77;
+	}
+
+	/* priority caps of the 'prio' scheduler: print them, force both to 0
+	 * and check the values read back */
+	{
+		const char * const max_prio_knob_name       = "starpu.task.s_max_priority_cap_knob";
+		const char * const min_prio_knob_name       = "starpu.task.s_min_priority_cap_knob";
+		const char * const knob_scope_name = "per_scheduler";
+		const char * const knob_type_name  = "int32";
+		int32_t max_prio_val;
+		int32_t min_prio_val;
+
+		const int scope_id = starpu_perf_knob_scope_name_to_id(knob_scope_name);
+
+		const int max_prio_id = starpu_perf_knob_name_to_id(scope_id, max_prio_knob_name);
+		STARPU_ASSERT(starpu_perf_knob_get_type_id(max_prio_id) == starpu_perf_knob_type_name_to_id(knob_type_name));
+
+		const int min_prio_id = starpu_perf_knob_name_to_id(scope_id, min_prio_knob_name);
+		STARPU_ASSERT(starpu_perf_knob_get_type_id(min_prio_id) == starpu_perf_knob_type_name_to_id(knob_type_name));
+
+		printf("%s:\n", max_prio_knob_name);
+		max_prio_val = starpu_perf_knob_get_per_scheduler_int32_value(max_prio_id, "prio");
+		printf("- %d\n", max_prio_val);
+
+		printf("%s:\n", min_prio_knob_name);
+		min_prio_val = starpu_perf_knob_get_per_scheduler_int32_value(min_prio_id, "prio");
+		printf("- %d\n", min_prio_val);
+		STARPU_ASSERT (max_prio_val >= min_prio_val);
+
+		/* the order of the two updates depends on the sign of the
+		 * current minimum; presumably this keeps max >= min at every
+		 * step -- confirm against the knob implementation */
+		if (min_prio_val > 0)
+		{
+			starpu_perf_knob_set_per_scheduler_int32_value(min_prio_id, "prio", 0);
+			starpu_perf_knob_set_per_scheduler_int32_value(max_prio_id, "prio", 0);
+		}
+		else
+		{
+			starpu_perf_knob_set_per_scheduler_int32_value(max_prio_id, "prio", 0);
+			starpu_perf_knob_set_per_scheduler_int32_value(min_prio_id, "prio", 0);
+		}
+
+		printf("%s:\n", max_prio_knob_name);
+		max_prio_val = starpu_perf_knob_get_per_scheduler_int32_value(max_prio_id, "prio");
+		printf("- %d\n", max_prio_val);
+
+		printf("%s:\n", min_prio_knob_name);
+		min_prio_val = starpu_perf_knob_get_per_scheduler_int32_value(min_prio_id, "prio");
+		printf("- %d\n", min_prio_val);
+		STARPU_ASSERT (max_prio_val == 0);
+		STARPU_ASSERT (min_prio_val == 0);
+
+	}
+
+	/* worker enable knob: disable one worker at a time and check that the
+	 * other one executed every task */
+	{
+		const char * const knob_name       = "starpu.worker.w_enable_worker_knob";
+		const char * const knob_scope_name = "per_worker";
+		const char * const knob_type_name  = "int32";
+		int32_t val;
+
+		const int scope_id = starpu_perf_knob_scope_name_to_id(knob_scope_name);
+		const int id = starpu_perf_knob_name_to_id(scope_id, knob_name);
+		STARPU_ASSERT(starpu_perf_knob_get_type_id(id) == starpu_perf_knob_type_name_to_id(knob_type_name));
+
+		struct starpu_codelet cl = {
+			.cpu_funcs = {cpu_func}
+		};
+
+		task_count[0] = 0;
+		task_count[1] = 0;
+
+		/* both workers are expected to start enabled */
+		val = starpu_perf_knob_get_per_worker_int32_value(id, 0);
+		STARPU_ASSERT(val == 1);
+		val = starpu_perf_knob_get_per_worker_int32_value(id, 1);
+		STARPU_ASSERT(val == 1);
+
+		/* worker 1 disabled: every task must run on worker 0 */
+		starpu_perf_knob_set_per_worker_int32_value(id, 1, 0);
+		val = starpu_perf_knob_get_per_worker_int32_value(id, 1);
+		STARPU_ASSERT(val == 0);
+
+		int i;
+		for (i=0; i<NTASKS; i++)
+		{
+			starpu_task_insert(&cl, 0);
+		}
+		starpu_task_wait_for_all();
+		STARPU_ASSERT(task_count[0] == NTASKS);
+		STARPU_ASSERT(task_count[1] == 0);
+
+		task_count[0] = 0;
+
+		/* swap roles: re-enable worker 1, disable worker 0 */
+		starpu_perf_knob_set_per_worker_int32_value(id, 1, 1);
+		val = starpu_perf_knob_get_per_worker_int32_value(id, 1);
+		STARPU_ASSERT(val == 1);
+
+		starpu_perf_knob_set_per_worker_int32_value(id, 0, 0);
+		val = starpu_perf_knob_get_per_worker_int32_value(id, 0);
+		STARPU_ASSERT(val == 0);
+
+		for (i=0; i<NTASKS; i++)
+		{
+			starpu_task_insert(&cl, 0);
+		}
+		starpu_task_wait_for_all();
+		STARPU_ASSERT(task_count[0] == 0);
+		STARPU_ASSERT(task_count[1] == NTASKS);
+
+		/* leave worker 0 enabled again */
+		starpu_perf_knob_set_per_worker_int32_value(id, 0, 1);
+		val = starpu_perf_knob_get_per_worker_int32_value(id, 0);
+		STARPU_ASSERT(val == 1);
+	}
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 2 - 2
examples/pipeline/pipeline.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2015,2017                           CNRS
+ * Copyright (C) 2012-2015,2017,2019                      CNRS
  * Copyright (C) 2012,2014-2017                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -243,7 +243,7 @@ int main(void)
 
 		ret = starpu_task_insert(&pipeline_codelet_sum,
 				STARPU_R, buffersY[l%K],
-				STARPU_CALLBACK_WITH_ARG, (void (*)(void*))sem_post, &sems[l%C],
+				STARPU_CALLBACK_WITH_ARG_NFREE, (void (*)(void*))sem_post, &sems[l%C],
 				STARPU_TAG_ONLY, (starpu_tag_t) l,
 				0);
 		if (ret == -ENODEV) goto enodev;

+ 5 - 4
examples/scheduler/dummy_modular_sched.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2018                                Université de Bordeaux
+ * Copyright (C) 2010-2019                                Université de Bordeaux
  * Copyright (C) 2012,2013,2015                           Inria
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  *
@@ -229,9 +229,10 @@ int main(void)
 	int ret;
 	struct starpu_conf conf;
 
-#ifdef STARPU_HAVE_UNSETENV
-	unsetenv("STARPU_SCHED");
-#endif
+	char *sched = getenv("STARPU_SCHED");
+	if (sched && sched[0])
+		/* Testing a specific scheduler, no need to run this */
+		return 77;
 
 	starpu_conf_init(&conf);
 	conf.sched_policy = &dummy_sched_policy,

+ 5 - 4
examples/scheduler/dummy_sched.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015                           Inria
- * Copyright (C) 2010-2018                                Université de Bordeaux
+ * Copyright (C) 2010-2019                                Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -158,9 +158,10 @@ int main(void)
 	int ret;
 	struct starpu_conf conf;
 
-#ifdef STARPU_HAVE_UNSETENV
-	unsetenv("STARPU_SCHED");
-#endif
+	char *sched = getenv("STARPU_SCHED");
+	if (sched && sched[0])
+		/* Testing a specific scheduler, no need to run this */
+		return 77;
 
 	starpu_conf_init(&conf);
 	conf.sched_policy = &dummy_sched_policy,

+ 6 - 1
examples/scheduler/schedulers.sh

@@ -30,7 +30,12 @@ then
     exit 77
 fi
 
-SCHEDULERS=`../tools/starpu_sched_display | grep -v heteroprio`
+if [ -n "$STARPU_SCHED" ]
+then
+	SCHEDULERS=$STARPU_SCHED
+else
+	SCHEDULERS=`../tools/starpu_sched_display | grep -v heteroprio`
+fi
 
 for sched in $SCHEDULERS
 do

+ 6 - 1
examples/scheduler/schedulers_context.sh

@@ -29,7 +29,12 @@ then
     exit 77
 fi
 
-SCHEDULERS=`../tools/starpu_sched_display | grep -v pheft | grep -v peager | grep -v heteroprio | grep -v modular-gemm`
+if [ -n "$STARPU_SCHED" ]
+then
+	SCHEDULERS="$STARPU_SCHED"
+else
+	SCHEDULERS=`../tools/starpu_sched_display | grep -v pheft | grep -v peager | grep -v heteroprio | grep -v modular-gemm`
+fi
 
 for sched in $SCHEDULERS
 do

+ 2 - 1
examples/stencil/Makefile.am

@@ -2,7 +2,7 @@
 #
 # Copyright (C) 2011,2016,2017                           Inria
 # Copyright (C) 2011-2019                                CNRS
-# Copyright (C) 2010-2017                                Université de Bordeaux
+# Copyright (C) 2010-2017,2019                           Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -27,6 +27,7 @@ LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
 if STARPU_SIMGRID
 MPI			=	$(abs_top_builddir)/tools/starpu_smpirun -np 4 -platform $(abs_top_srcdir)/tools/perfmodels/cluster.xml -hostfile $(abs_top_srcdir)/tools/perfmodels/hostfile
+NVCCFLAGS		+= --compiler-options -fPIC
 else
 MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
 endif

+ 3 - 1
include/fstarpu_mod.f90

@@ -2,7 +2,7 @@
 !
 ! Copyright (C) 2016,2017                                Inria
 ! Copyright (C) 2017,2018,2019                           CNRS
-! Copyright (C) 2016-2018                                Université de Bordeaux
+! Copyright (C) 2016-2019                                Université de Bordeaux
 !
 ! StarPU is free software; you can redistribute it and/or modify
 ! it under the terms of the GNU Lesser General Public License as published by
@@ -60,6 +60,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_HANDLES_SEQUENTIAL_CONSISTENCY
         type(c_ptr), bind(C) :: FSTARPU_TASK_END_DEP
         type(c_ptr), bind(C) :: FSTARPU_NODE_SELECTION_POLICY
+        type(c_ptr), bind(C) :: FSTARPU_TASK_SCHED_DATA
 
         type(c_ptr), bind(C) :: FSTARPU_VALUE
         type(c_ptr), bind(C) :: FSTARPU_SCHED_CTX
@@ -2279,6 +2280,7 @@ module fstarpu_mod
                         FSTARPU_TAG_ONLY        = fstarpu_get_constant(C_CHAR_"FSTARPU_TAG_ONLY"//C_NULL_CHAR)
                         FSTARPU_NAME    = fstarpu_get_constant(C_CHAR_"FSTARPU_NAME"//C_NULL_CHAR)
                         FSTARPU_NODE_SELECTION_POLICY   = fstarpu_get_constant(C_CHAR_"FSTARPU_NODE_SELECTION_POLICY"//C_NULL_CHAR)
+                        FSTARPU_TASK_SCHED_DATA = fstarpu_get_constant(C_CHAR_"FSTARPU_TASK_SCHED_DATA"//C_NULL_CHAR)
 
                         FSTARPU_VALUE   = fstarpu_get_constant(C_CHAR_"FSTARPU_VALUE"//C_NULL_CHAR)
                         FSTARPU_SCHED_CTX   = fstarpu_get_constant(C_CHAR_"FSTARPU_SCHED_CTX"//C_NULL_CHAR)

+ 2 - 0
include/starpu.h

@@ -83,6 +83,8 @@ typedef INT_PTR intptr_t;
 #include <starpu_simgrid_wrap.h>
 #include <starpu_bitmap.h>
 #include <starpu_clusters.h>
+#include <starpu_perf_monitoring.h>
+#include <starpu_perf_steering.h>
 
 #ifdef __cplusplus
 extern "C"

+ 6 - 1
include/starpu_config.h.in

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2014,2016,2017                 Inria
- * Copyright (C) 2009-2018                                Université de Bordeaux
+ * Copyright (C) 2009-2019                                Université de Bordeaux
  * Copyright (C) 2010-2017,2019                           CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -43,6 +43,11 @@
 #undef STARPU_SIMGRID_MC
 #undef STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT
 #undef STARPU_HAVE_SIMGRID_MSG_H
+#undef STARPU_HAVE_SIMGRID_ACTOR_H
+#undef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+#undef STARPU_HAVE_SIMGRID_MUTEX_H
+#undef STARPU_HAVE_SIMGRID_COND_H
+#undef STARPU_HAVE_SIMGRID_BARRIER_H
 #undef STARPU_HAVE_XBT_SYNCHRO_H
 #undef STARPU_HAVE_VALGRIND_H
 #undef STARPU_HAVE_MEMCHECK_H

+ 13 - 0
include/starpu_data_interfaces.h

@@ -467,6 +467,14 @@ struct starpu_data_interface_ops
 	size_t 		 (*get_alloc_size)		(starpu_data_handle_t handle);
 
 	/**
+	   Return the maximum size that the data may need to increase to. For
+	   instance, in the case of compressed matrix tiles this is the size
+	   when the block is fully dense.
+	   This is currently only used for feedback tools.
+	*/
+	size_t 		 (*get_max_size)		(starpu_data_handle_t handle);
+
+	/**
 	  Return a 32bit footprint which characterizes the data size and layout (nx, ny, ld, elemsize, etc.), required for indexing performance models.
 
 	  starpu_hash_crc32c_be() and alike can be used to produce this 32bit value from various types of values.
@@ -667,6 +675,11 @@ size_t starpu_data_get_size(starpu_data_handle_t handle);
 size_t starpu_data_get_alloc_size(starpu_data_handle_t handle);
 
 /**
+   Return the maximum size that the \p handle data may need to increase to.
+*/
+starpu_ssize_t starpu_data_get_max_size(starpu_data_handle_t handle);
+
+/**
    Return the handle corresponding to the data pointed to by the \p ptr host pointer.
 */
 starpu_data_handle_t starpu_data_lookup(const void *ptr);

+ 203 - 0
include/starpu_perf_monitoring.h

@@ -0,0 +1,203 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_PERF_MONITORING_H__
+#define __STARPU_PERF_MONITORING_H__
+
+#include <starpu.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+   @defgroup API_Perf_Monitoring Performance Monitoring Counters
+   @brief This section describes the interface to access performance monitoring counters.
+   @{
+*/
+
+/**
+   @name API
+   \anchor PM_API
+   @{
+*/
+/**
+   Enum of all possible performance counter scopes.
+ */
+enum starpu_perf_counter_scope
+{
+	starpu_perf_counter_scope_undefined     = 0, /**< undefined scope */
+	starpu_perf_counter_scope_global        = 2, /**< global scope */
+	starpu_perf_counter_scope_per_worker    = 4, /**< per-worker scope */
+	starpu_perf_counter_scope_per_codelet   = 6  /**< per-codelet scope */
+};
+
+/**
+  Enum of all possible performance counter value types.
+ */
+enum starpu_perf_counter_type
+{
+	starpu_perf_counter_type_undefined = 0, /**< undefined value type */
+	starpu_perf_counter_type_int32     = 1, /**< signed 32-bit integer value */
+	starpu_perf_counter_type_int64     = 2, /**< signed 64-bit integer value */
+	starpu_perf_counter_type_float     = 3, /**< 32-bit single precision floating-point value */
+	starpu_perf_counter_type_double    = 4  /**< 64-bit double precision floating-point value */
+};
+
+struct starpu_perf_counter_listener;
+struct starpu_perf_counter_sample;
+struct starpu_perf_counter_set;
+
+/**
+  Translate scope name constant string to scope id.
+  */
+int starpu_perf_counter_scope_name_to_id(const char *name);
+/**
+  Translate scope id to scope name constant string.
+  */
+const char *starpu_perf_counter_scope_id_to_name(enum starpu_perf_counter_scope scope);
+
+/**
+  Translate type name constant string to type id.
+  */
+int starpu_perf_counter_type_name_to_id(const char *name);
+/**
+  Translate type id to type name constant string.
+  */
+const char *starpu_perf_counter_type_id_to_name(enum starpu_perf_counter_type type);
+
+/**
+  Return the number of performance counters for the given scope.
+  */
+int starpu_perf_counter_nb(enum starpu_perf_counter_scope scope);
+/**
+  Translate a performance counter name to its id.
+  */
+int starpu_perf_counter_name_to_id(enum starpu_perf_counter_scope scope, const char *name);
+/**
+  Translate a performance counter rank in its scope to its counter id.
+  */
+int starpu_perf_counter_nth_to_id(enum starpu_perf_counter_scope scope, int nth);
+/**
+  Translate a counter id to its name constant string.
+  */
+const char *starpu_perf_counter_id_to_name(int id);
+/**
+  Return the counter's type id.
+  */
+int starpu_perf_counter_get_type_id(int id);
+/**
+  Return the counter's help string.
+  */
+const char *starpu_perf_counter_get_help_string(int id);
+
+/**
+  Display the list of counters defined in the given scope.
+  */
+void starpu_perf_counter_list_avail(enum starpu_perf_counter_scope scope);
+/**
+  Display the list of counters defined in all scopes.
+  */
+void starpu_perf_counter_list_all_avail(void);
+
+/**
+  Allocate a new performance counter set.
+  */
+struct starpu_perf_counter_set *starpu_perf_counter_set_alloc(enum starpu_perf_counter_scope scope);
+/**
+  Free a performance counter set.
+  */
+void starpu_perf_counter_set_free(struct starpu_perf_counter_set *set);
+
+/**
+  Enable a given counter in the set.
+  */
+void starpu_perf_counter_set_enable_id(struct starpu_perf_counter_set *set, int id);
+/**
+  Disable a given counter in the set.
+  */
+void starpu_perf_counter_set_disable_id(struct starpu_perf_counter_set *set, int id);
+
+/**
+  Initialize a new performance counter listener.
+  */
+struct starpu_perf_counter_listener *starpu_perf_counter_listener_init(struct starpu_perf_counter_set *set, void (*callback)(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context), void *user_arg);
+/**
+  End a performance counter listener.
+  */
+void starpu_perf_counter_listener_exit(struct starpu_perf_counter_listener *listener);
+
+/**
+  Set a listener for the global scope.
+  */
+void starpu_perf_counter_set_global_listener(struct starpu_perf_counter_listener *listener);
+/**
+  Set a listener for the per_worker scope on a given worker.
+  */
+void starpu_perf_counter_set_per_worker_listener(unsigned workerid, struct starpu_perf_counter_listener *listener);
+/**
+  Set a common listener for all workers.
+  */
+void starpu_perf_counter_set_all_per_worker_listeners(struct starpu_perf_counter_listener *listener);
+/**
+  Set a per_codelet listener for a codelet.
+  */
+void starpu_perf_counter_set_per_codelet_listener(struct starpu_codelet *cl, struct starpu_perf_counter_listener *listener);
+
+/**
+  Unset the global listener.
+  */
+void starpu_perf_counter_unset_global_listener(void);
+/**
+  Unset the per_worker listener.
+  */
+void starpu_perf_counter_unset_per_worker_listener(unsigned workerid);
+/**
+  Unset all per_worker listeners.
+  */
+void starpu_perf_counter_unset_all_per_worker_listeners(void);
+/**
+  Unset a per_codelet listener.
+  */
+void starpu_perf_counter_unset_per_codelet_listener(struct starpu_codelet *cl);
+
+/**
+  Read an int32 counter value from a sample.
+  */
+int32_t starpu_perf_counter_sample_get_int32_value(struct starpu_perf_counter_sample *sample, const int counter_id);
+/**
+  Read an int64 counter value from a sample.
+  */
+int64_t starpu_perf_counter_sample_get_int64_value(struct starpu_perf_counter_sample *sample, const int counter_id);
+/**
+  Read a float counter value from a sample.
+  */
+float starpu_perf_counter_sample_get_float_value(struct starpu_perf_counter_sample *sample, const int counter_id);
+/**
+  Read a double counter value from a sample.
+  */
+double starpu_perf_counter_sample_get_double_value(struct starpu_perf_counter_sample *sample, const int counter_id);
+
+/** @} */
+
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_PERF_MONITORING_H__ */

+ 225 - 0
include/starpu_perf_steering.h

@@ -0,0 +1,225 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_PERF_STEERING_H__
+#define __STARPU_PERF_STEERING_H__
+
+#include <starpu.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+   @defgroup API_Perf_Steering Performance Steering Knobs
+   @brief This section describes the interface to access performance steering knobs.
+   @{
+*/
+
+/**
+   @name API
+   \anchor PM_API
+   @{
+*/
+/**
+   Enum of all possible performance knob scopes.
+ */
+enum starpu_perf_knob_scope
+{
+	starpu_perf_knob_scope_undefined     = 0, /**< undefined scope */
+	starpu_perf_knob_scope_global        = 1, /**< global scope */
+	starpu_perf_knob_scope_per_worker    = 3, /**< per-worker scope */
+	starpu_perf_knob_scope_per_scheduler = 5  /**< per-scheduler scope */
+};
+
+/**
+  Enum of all possible performance knob value types.
+ */
+enum starpu_perf_knob_type
+{
+	starpu_perf_knob_type_undefined = 0, /**< undefined value type */
+	starpu_perf_knob_type_int32     = 1, /**< signed 32-bit integer value */
+	starpu_perf_knob_type_int64     = 2, /**< signed 64-bit integer value */
+	starpu_perf_knob_type_float     = 3, /**< 32-bit single precision floating-point value */
+	starpu_perf_knob_type_double    = 4  /**< 64-bit double precision floating-point value */
+};
+
+/**
+  Translate scope name constant string to scope id.
+  */
+int starpu_perf_knob_scope_name_to_id(const char *name);
+/**
+  Translate scope id to scope name constant string.
+  */
+const char *starpu_perf_knob_scope_id_to_name(enum starpu_perf_knob_scope scope);
+
+/**
+  Translate type name constant string to type id.
+  */
+int starpu_perf_knob_type_name_to_id(const char *name);
+/**
+  Translate type id to type name constant string.
+  */
+const char *starpu_perf_knob_type_id_to_name(enum starpu_perf_knob_type type);
+
+/**
+  Return the number of performance steering knobs for the given scope.
+  */
+int starpu_perf_knob_nb(enum starpu_perf_knob_scope scope);
+/**
+  Translate a performance knob name to its id.
+  */
+int starpu_perf_knob_name_to_id(enum starpu_perf_knob_scope scope, const char *name);
+/**
+  Translate a performance knob rank in its scope to its knob id.
+  */
+int starpu_perf_knob_nth_to_id(enum starpu_perf_knob_scope scope, int nth);
+/**
+  Translate a knob id to its name constant string.
+  */
+const char *starpu_perf_knob_id_to_name(int id);
+/**
+  Return the knob's type id.
+  */
+int starpu_perf_knob_get_type_id(int id);
+/**
+  Return the knob's help string.
+  */
+const char *starpu_perf_knob_get_help_string(int id);
+
+/**
+  Display the list of knobs defined in the given scope.
+  */
+void starpu_perf_knob_list_avail(enum starpu_perf_knob_scope scope);
+/**
+  Display the list of knobs defined in all scopes.
+  */
+void starpu_perf_knob_list_all_avail(void);
+
+/**
+  Get int32 knob value for Global scope.
+  */
+int32_t starpu_perf_knob_get_global_int32_value (const int knob_id);
+/**
+  Get int64 knob value for Global scope.
+  */
+int64_t starpu_perf_knob_get_global_int64_value (const int knob_id);
+/**
+  Get float knob value for Global scope.
+  */
+float   starpu_perf_knob_get_global_float_value (const int knob_id);
+/**
+  Get double knob value for Global scope.
+  */
+double  starpu_perf_knob_get_global_double_value(const int knob_id);
+
+/**
+  Set int32 knob value for Global scope.
+  */
+void starpu_perf_knob_set_global_int32_value (const int knob_id, int32_t new_value);
+/**
+  Set int64 knob value for Global scope.
+  */
+void starpu_perf_knob_set_global_int64_value (const int knob_id, int64_t new_value);
+/**
+  Set float knob value for Global scope.
+  */
+void starpu_perf_knob_set_global_float_value (const int knob_id, float   new_value);
+/**
+  Set double knob value for Global scope.
+  */
+void starpu_perf_knob_set_global_double_value(const int knob_id, double  new_value);
+
+
+/**
+ Get int32 value for Per_worker scope.
+  */
+int32_t starpu_perf_knob_get_per_worker_int32_value (const int knob_id, unsigned workerid);
+/**
+ Get int64 value for Per_worker scope.
+  */
+int64_t starpu_perf_knob_get_per_worker_int64_value (const int knob_id, unsigned workerid);
+/**
+ Get float value for Per_worker scope.
+  */
+float   starpu_perf_knob_get_per_worker_float_value (const int knob_id, unsigned workerid);
+/**
+ Get double value for Per_worker scope.
+  */
+double  starpu_perf_knob_get_per_worker_double_value(const int knob_id, unsigned workerid);
+
+/**
+ Set int32 value for Per_worker scope.
+  */
+void starpu_perf_knob_set_per_worker_int32_value (const int knob_id, unsigned workerid, int32_t new_value);
+/**
+ Set int64 value for Per_worker scope.
+  */
+void starpu_perf_knob_set_per_worker_int64_value (const int knob_id, unsigned workerid, int64_t new_value);
+/**
+ Set float value for Per_worker scope.
+  */
+void starpu_perf_knob_set_per_worker_float_value (const int knob_id, unsigned workerid, float   new_value);
+/**
+ Set double value for Per_worker scope.
+  */
+void starpu_perf_knob_set_per_worker_double_value(const int knob_id, unsigned workerid, double  new_value);
+
+
+/**
+ Get int32 value for per_scheduler scope.
+  */
+int32_t starpu_perf_knob_get_per_scheduler_int32_value (const int knob_id, const char * sched_policy_name);
+/**
+ Get int64 value for per_scheduler scope.
+  */
+int64_t starpu_perf_knob_get_per_scheduler_int64_value (const int knob_id, const char * sched_policy_name);
+/**
+ Get float value for per_scheduler scope.
+  */
+float   starpu_perf_knob_get_per_scheduler_float_value (const int knob_id, const char * sched_policy_name);
+/**
+ Get double value for per_scheduler scope.
+  */
+double  starpu_perf_knob_get_per_scheduler_double_value(const int knob_id, const char * sched_policy_name);
+
+/**
+ Set int32 value for per_scheduler scope.
+  */
+void starpu_perf_knob_set_per_scheduler_int32_value (const int knob_id, const char * sched_policy_name, int32_t new_value);
+/**
+ Set int64 value for per_scheduler scope.
+  */
+void starpu_perf_knob_set_per_scheduler_int64_value (const int knob_id, const char * sched_policy_name, int64_t new_value);
+/**
+ Set float value for per_scheduler scope.
+  */
+void starpu_perf_knob_set_per_scheduler_float_value (const int knob_id, const char * sched_policy_name, float   new_value);
+/**
+ Set double value for per_scheduler scope.
+  */
+void starpu_perf_knob_set_per_scheduler_double_value(const int knob_id, const char * sched_policy_name, double  new_value);
+
+/** @} */
+
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_PERF_STEERING_H__ */

+ 76 - 2
include/starpu_sched_component.h

@@ -373,6 +373,11 @@ void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task, unsi
 */
 
 /**
+   default function for the pull component method, just call pull of parents until one of them returns a task
+*/
+struct starpu_task * starpu_sched_component_parents_pull_task(struct starpu_sched_component * component, struct starpu_sched_component * to);
+
+/**
    default function for the can_push component method, just call can_push of parents until one of them returns non-zero
 */
 int starpu_sched_component_can_push(struct starpu_sched_component * component, struct starpu_sched_component * to);
@@ -416,6 +421,7 @@ struct starpu_sched_component_fifo_data
 {
 	unsigned ntasks_threshold;
 	double exp_len_threshold;
+	int ready;
 };
 
 /**
@@ -441,6 +447,7 @@ struct starpu_sched_component_prio_data
 {
 	unsigned ntasks_threshold;
 	double exp_len_threshold;
+	int ready;
 };
 struct starpu_sched_component *starpu_sched_component_prio_create(struct starpu_sched_tree *tree, struct starpu_sched_component_prio_data *prio_data) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_prio(struct starpu_sched_component *component);
@@ -494,6 +501,18 @@ int starpu_sched_component_is_random(struct starpu_sched_component *);
 struct starpu_sched_component *starpu_sched_component_eager_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_eager(struct starpu_sched_component *);
 
+/** @} */
+
+/**
+   @name Resource-mapping Eager Prio Component API
+   @{
+*/
+
+struct starpu_sched_component *starpu_sched_component_eager_prio_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
+int starpu_sched_component_is_eager_prio(struct starpu_sched_component *);
+
+/** @} */
+
 /**
    @name Resource-mapping Eager-Calibration Component API
    @{
@@ -539,6 +558,22 @@ int starpu_sched_component_is_heft(struct starpu_sched_component *component);
 /** @} */
 
 /**
+   @name Resource-mapping Heteroprio Component API
+   @{
+*/
+
+struct starpu_sched_component_heteroprio_data
+{
+	struct starpu_sched_component_mct_data *mct;
+	unsigned batch;
+};
+
+struct starpu_sched_component * starpu_sched_component_heteroprio_create(struct starpu_sched_tree *tree, struct starpu_sched_component_heteroprio_data * params) STARPU_ATTRIBUTE_MALLOC;
+int starpu_sched_component_is_heteroprio(struct starpu_sched_component *component);
+
+/** @} */
+
+/**
    @name Special-purpose Best_Implementation Component API
    @{
 */
@@ -570,6 +605,26 @@ int starpu_sched_component_is_perfmodel_select(struct starpu_sched_component *co
 /** @} */
 
 /**
+   @name Staged pull Component API
+   @{
+*/
+
+struct starpu_sched_component * starpu_sched_component_stage_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
+int starpu_sched_component_is_stage(struct starpu_sched_component *component);
+
+/** @} */
+
+/**
+   @name User-choice push Component API
+   @{
+*/
+
+struct starpu_sched_component * starpu_sched_component_userchoice_create(struct starpu_sched_tree *tree, void *arg) STARPU_ATTRIBUTE_MALLOC;
+int starpu_sched_component_is_userchoice(struct starpu_sched_component *component);
+
+/** @} */
+
+/**
    @name Recipe Component API
    @{
 */
@@ -709,14 +764,19 @@ struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_c
 #define STARPU_SCHED_SIMPLE_FIFOS_BELOW_PRIO	(1<<9)
 
 /**
+   Request that the fifos below preferably pull ready tasks
+*/
+#define STARPU_SCHED_SIMPLE_FIFOS_BELOW_READY	(1<<10)
+
+/**
    Request that work between workers using the same fifo below be distributed using a work stealing component.
 */
-#define STARPU_SCHED_SIMPLE_WS_BELOW		(1<<10)
+#define STARPU_SCHED_SIMPLE_WS_BELOW		(1<<11)
 
 /**
    Request to not only choose between simple workers, but also choose between combined workers.
 */
-#define STARPU_SCHED_SIMPLE_COMBINED_WORKERS	(1<<11)
+#define STARPU_SCHED_SIMPLE_COMBINED_WORKERS	(1<<12)
 
 /**
    Create a simple modular scheduler tree around a scheduling decision-making
@@ -727,6 +787,20 @@ struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_c
 */
 void starpu_sched_component_initialize_simple_scheduler(starpu_sched_component_create_t create_decision_component, void *data, unsigned flags, unsigned sched_ctx_id);
 
+/**
+   Create a simple modular scheduler tree around several scheduling decision-making
+   components. The parameters are similar to
+   starpu_sched_component_initialize_simple_scheduler, but per scheduling decision, for instance:
+
+   starpu_sched_component_initialize_simple_schedulers(sched_ctx_id, 2,
+     create1, data1, flags1,
+     create2, data2, flags2);
+
+   The different flags parameters must be coherent: same decision flags. They
+   must not include the perfmodel flag (not supported yet).
+*/
+void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id, unsigned ndecisions, ...);
+
 /** @} */
 
 #define STARPU_COMPONENT_MUTEX_LOCK(m) \

+ 50 - 17
include/starpu_task.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2017                                Inria
+ * Copyright (C) 2011-2017,2019                           Inria
  * Copyright (C) 2009-2019                                Université de Bordeaux
  * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -565,6 +565,9 @@ struct starpu_codelet
 	   Various flags for the codelet.
 	 */
 	int flags;
+
+	struct starpu_perf_counter_sample *perf_counter_sample;
+	struct starpu_perf_counter_sample_cl_values *perf_counter_values;
 };
 
 /**
@@ -759,7 +762,9 @@ struct starpu_task
 
 	   With starpu_task_insert() and alike this can be specified thanks to
 	   ::STARPU_CALLBACK followed by the function pointer, or thanks to
-	   ::STARPU_CALLBACK_WITH_ARG followed by the function pointer and the argument.
+	   ::STARPU_CALLBACK_WITH_ARG (or
+	   ::STARPU_CALLBACK_WITH_ARG_NFREE) followed by the function
+	   pointer and the argument.
 	*/
 	void (*callback_func)(void *);
 	/**
@@ -770,7 +775,9 @@ struct starpu_task
 
 	   With starpu_task_insert() and alike this can be specified thanks to
 	   ::STARPU_CALLBACK_ARG followed by the argument, or thanks to
-	   ::STARPU_CALLBACK_WITH_ARG followed by the function pointer and the argument.
+	   ::STARPU_CALLBACK_WITH_ARG or
+	   ::STARPU_CALLBACK_WITH_ARG_NFREE followed by the function
+	   pointer and the argument.
 	*/
 	void *callback_arg;
 
@@ -788,6 +795,7 @@ struct starpu_task
 	   ::STARPU_PROLOGUE_CALLBACK followed by the function pointer.
 	*/
 	void (*prologue_callback_func)(void *);
+
 	/**
 	   Optional field, the default value is <c>NULL</c>. This is
 	   the pointer passed to the prologue callback function. This
@@ -795,7 +803,7 @@ struct starpu_task
 	   starpu_task::prologue_callback_func is set to <c>NULL</c>.
 
 	   With starpu_task_insert() and alike this can be specified thanks to
-	   ::STARPU_PROLOGUE_CALLBACK followed by the function pointer.
+	   ::STARPU_PROLOGUE_CALLBACK_ARG followed by the argument
 	*/
 	void *prologue_callback_arg;
 
@@ -826,6 +834,7 @@ struct starpu_task
 	   ::STARPU_CL_ARGS.
 	*/
 	unsigned cl_arg_free:1;
+
 	/**
 	   Optional field. In case starpu_task::callback_arg was
 	   allocated by the application through <c>malloc()</c>,
@@ -833,9 +842,12 @@ struct starpu_task
 	   automatically call <c>free(callback_arg)</c> when
 	   destroying the task.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike, this is set to 1 when using
+	   ::STARPU_CALLBACK_ARG or ::STARPU_CALLBACK_WITH_ARG, or set to 0 when
+	   using ::STARPU_CALLBACK_ARG_NFREE or ::STARPU_CALLBACK_WITH_ARG_NFREE
 	*/
 	unsigned callback_arg_free:1;
+
 	/**
 	   Optional field. In case starpu_task::prologue_callback_arg
 	   was allocated by the application through <c>malloc()</c>,
@@ -843,9 +855,12 @@ struct starpu_task
 	   StarPU automatically call
 	   <c>free(prologue_callback_arg)</c> when destroying the task.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike this is set to 1 when using
+	   ::STARPU_PROLOGUE_CALLBACK_ARG, or set to 0 when using
+	   ::STARPU_PROLOGUE_CALLBACK_ARG_NFREE
 	*/
 	unsigned prologue_callback_arg_free:1;
+
 	/**
 	   Optional field. In case starpu_task::prologue_callback_pop_arg
 	   was allocated by the application through <c>malloc()</c>,
@@ -854,7 +869,9 @@ struct starpu_task
 	   <c>free(prologue_callback_pop_arg)</c> when destroying the
 	   task.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike this is set to 1 when using
+	   ::STARPU_PROLOGUE_CALLBACK_POP_ARG, or set to 0 when using
+	   ::STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE
 	*/
 	unsigned prologue_callback_pop_arg_free:1;
 
@@ -877,7 +894,8 @@ struct starpu_task
 	   this flag permits to disable sequential consistency for
 	   this task, even if data have it enabled.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_SEQUENTIAL_CONSISTENCY followed by an unsigned.
 	*/
 	unsigned sequential_consistency:1;
 
@@ -951,7 +969,9 @@ struct starpu_task
 	/**
 	   do not allocate a submitorder id for this task
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike this can be specified
+	   thanks to ::STARPU_TASK_NO_SUBMITORDER followed by
+	   an unsigned.
 	*/
 	unsigned no_submitorder:1;
 
@@ -1006,7 +1026,10 @@ struct starpu_task
 	   workers which are allowed to execute the task.
 	   starpu_task::workerid takes precedence over this.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike, this can be specified
+	   along the field workerids_len thanks to ::STARPU_TASK_WORKERIDS
+	   followed by a number of workers and an array of bits whose
+	   size is the number of workers.
 	*/
 	uint32_t *workerids;
 
@@ -1014,7 +1037,10 @@ struct starpu_task
 	   Optional field. This provides the number of uint32_t values
 	   in the starpu_task::workerids array.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike, this can be specified
+	   along the field workerids thanks to ::STARPU_TASK_WORKERIDS
+	   followed by a number of workers and an array of bits whose
+	   size is the number of workers.
 	*/
 	unsigned workerids_len;
 
@@ -1109,7 +1135,9 @@ struct starpu_task
 	/**
 	   Optional field. Profiling information for the task.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_TASK_PROFILING_INFO followed by a pointer to the
+	   appropriate struct.
 	*/
 	struct starpu_profiling_task_info *profiling_info;
 
@@ -1182,6 +1210,9 @@ struct starpu_task
 	/**
 	   This field is managed by the scheduler, which is allowed to do
 	   whatever with it.  Typically, some area would be allocated on push, and released on pop.
+
+	   With starpu_task_insert() and alike this is set when using
+	   ::STARPU_TASK_SCHED_DATA.
 	*/
 	void *sched_data;
 };
@@ -1195,7 +1226,8 @@ struct starpu_task
    equivalent to initializing a structure starpu_task
    with the function starpu_task_init().
 */
-/* Note: remember to update starpu_task_init as well */
+/* Note: remember to update starpu_task_init and starpu_task_ft_create_retry
+ * as well */
 #define STARPU_TASK_INITIALIZER 			\
 {							\
 	.cl = NULL,					\
@@ -1552,13 +1584,15 @@ void starpu_task_set_implementation(struct starpu_task *task, unsigned impl);
 unsigned starpu_task_get_implementation(struct starpu_task *task);
 
 /**
-   Create (and submit) an empty task that unlocks a tag once all its
+   Create and submit an empty task that unlocks a tag once all its
    dependencies are fulfilled.
  */
 void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
 
-
-
+/**
+   Create and submit an empty task with the given callback
+ */
+void starpu_create_callback_task(void (*callback)(void *), void *callback_arg);
 
 /**
    Function to be used as a prologue callback to enable fault tolerance for the
@@ -1573,7 +1607,6 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
  */
 void starpu_task_ft_prologue(void *check_ft);
 
-
 /**
    Create a try-task for a \p meta_task, given a \p template_task task
    template. The meta task can be passed as template on the first call, but

+ 92 - 2
include/starpu_task_util.h

@@ -57,7 +57,7 @@ extern "C"
    Used when calling starpu_task_insert(), must be followed by two
    pointers: one to a callback function, and the other to be given as
    an argument to the callback function; this is equivalent to using
-   both ::STARPU_CALLBACK and ::STARPU_CALLBACK_WITH_ARG.
+   both ::STARPU_CALLBACK and ::STARPU_CALLBACK_ARG.
 */
 #define STARPU_CALLBACK_WITH_ARG (3<<STARPU_MODE_SHIFT)
 
@@ -112,9 +112,30 @@ extern "C"
 */
 #define STARPU_SCHED_CTX	 (13<<STARPU_MODE_SHIFT)
 
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to a prologue callback function
+*/
 #define STARPU_PROLOGUE_CALLBACK   (14<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to be given as an argument to the prologue callback
+   function
+*/
 #define STARPU_PROLOGUE_CALLBACK_ARG (15<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to a prologue callback pop function
+*/
 #define STARPU_PROLOGUE_CALLBACK_POP   (16<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to be given as an argument to the prologue callback pop
+   function
+*/
 #define STARPU_PROLOGUE_CALLBACK_POP_ARG (17<<STARPU_MODE_SHIFT)
 
 /**
@@ -220,7 +241,76 @@ extern "C"
 */
 #define STARPU_TASK_END_DEP	(32<<STARPU_MODE_SHIFT)
 
-#define STARPU_SHIFTED_MODE_MAX (33<<STARPU_MODE_SHIFT)
+/**
+   Used when calling starpu_task_insert(), must be followed by an
+   unsigned being a number of workers, and an array of bits whose size
+   is the number of workers; the array indicates the set of workers
+   which are allowed to execute the task.
+*/
+#define STARPU_TASK_WORKERIDS (33<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by an
+   unsigned which sets the sequential consistency for the data
+   parameters of the task.
+*/
+#define STARPU_SEQUENTIAL_CONSISTENCY (34<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert() and alike, must be followed
+   by a pointer to a struct starpu_profiling_task_info
+ */
+#define STARPU_TASK_PROFILING_INFO (35<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert() and alike, must be followed
+   by an unsigned specifying not to allocate a submitorder id for the task
+ */
+#define STARPU_TASK_NO_SUBMITORDER (36<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), similarly to
+   ::STARPU_CALLBACK_ARG, must be followed by a pointer to be given as
+   an argument to the callback function, the argument will not be
+   freed, i.e starpu_task::callback_arg_free will be set to 0
+*/
+#define STARPU_CALLBACK_ARG_NFREE	 (37<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), similarly to
+   ::STARPU_CALLBACK_WITH_ARG, must be followed by two pointers: one
+   to a callback function, and the other to be given as an argument to
+   the callback function; this is equivalent to using both
+   ::STARPU_CALLBACK and ::STARPU_CALLBACK_ARG_NFREE.
+*/
+#define STARPU_CALLBACK_WITH_ARG_NFREE	 (38<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), similarly to
+   ::STARPU_PROLOGUE_CALLBACK_ARG, must be followed by a
+   pointer to be given as an argument to the prologue callback
+   function, the argument will not be
+   freed, i.e starpu_task::prologue_callback_arg_free will be set to 0
+*/
+#define STARPU_PROLOGUE_CALLBACK_ARG_NFREE (39<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), similarly to
+   ::STARPU_PROLOGUE_CALLBACK_POP_ARG, must be followed by a pointer
+   to be given as an argument to the prologue callback pop function,
+   the argument will not be freed, i.e
+   starpu_task::prologue_callback_pop_arg_free will be set to 0
+*/
+#define STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE (40<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert() and alike, must be followed
+   by a void* specifying the value to be set in the sched_data field of the
+   task.
+ */
+#define STARPU_TASK_SCHED_DATA (41<<STARPU_MODE_SHIFT)
+
+#define STARPU_SHIFTED_MODE_MAX (42<<STARPU_MODE_SHIFT)
 
 /**
    Set the given \p task corresponding to \p cl with the following arguments.

+ 51 - 3
include/starpu_thread.h

@@ -30,6 +30,21 @@
 #else
 #include <xbt/synchro_core.h>
 #endif
+#ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+#include <simgrid/actor.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+#include <simgrid/semaphore.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+#include <simgrid/mutex.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+#include <simgrid/cond.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_BARRIER_H
+#include <simgrid/barrier.h>
+#endif
 #ifdef STARPU_HAVE_SIMGRID_MSG_H
 #include <simgrid/msg.h>
 #else
@@ -52,12 +67,21 @@ extern "C"
 
 #ifdef STARPU_SIMGRID
 
+#ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+typedef sg_actor_t starpu_pthread_t;
+#else
 typedef msg_process_t starpu_pthread_t;
+#endif
 typedef int starpu_pthread_attr_t;
 
+#ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+typedef sg_host_t starpu_sg_host_t;
+#else
+typedef msg_host_t starpu_sg_host_t;
+#endif
 int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2);
 starpu_pthread_t starpu_pthread_self(void);
-int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, msg_host_t host);
+int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, starpu_sg_host_t host);
 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
 int starpu_pthread_join(starpu_pthread_t thread, void **retval);
 int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
@@ -97,7 +121,11 @@ typedef pthread_attr_t starpu_pthread_attr_t;
  */
 
 #ifdef STARPU_SIMGRID
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+typedef sg_mutex_t starpu_pthread_mutex_t;
+#else
 typedef xbt_mutex_t starpu_pthread_mutex_t;
+#endif
 typedef int starpu_pthread_mutexattr_t;
 
 #define STARPU_PTHREAD_MUTEX_INITIALIZER NULL
@@ -173,7 +201,11 @@ typedef pthread_key_t starpu_pthread_key_t;
 
 #ifdef STARPU_SIMGRID
 
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+typedef sg_cond_t starpu_pthread_cond_t;
+#else
 typedef xbt_cond_t starpu_pthread_cond_t;
+#endif
 typedef int starpu_pthread_condattr_t;
 #define STARPU_PTHREAD_COND_INITIALIZER NULL
 
@@ -211,7 +243,11 @@ int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t
 
 #ifdef STARPU_SIMGRID
 
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+typedef sg_mutex_t starpu_pthread_rwlock_t;
+#else
 typedef xbt_mutex_t starpu_pthread_rwlock_t;
+#endif
 typedef int starpu_pthread_rwlockattr_t;
 
 int starpu_pthread_rwlock_init(starpu_pthread_rwlock_t *rwlock, const starpu_pthread_rwlockattr_t *attr);
@@ -252,10 +288,18 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
 #if defined(STARPU_SIMGRID) || (!defined(STARPU_HAVE_PTHREAD_BARRIER) && (!defined(_MSC_VER) || defined(BUILDING_STARPU)))
 
-#if defined(STARPU_SIMGRID) && (defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT) || defined(xbt_barrier_init))
+#if defined(STARPU_SIMGRID) && (defined(STARPU_HAVE_SIMGRID_BARRIER_H) || defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT) || defined(xbt_barrier_init))
+#ifdef STARPU_HAVE_SIMGRID_BARRIER_H
+typedef sg_bar_t starpu_pthread_barrier_t;
+#else
 typedef xbt_bar_t starpu_pthread_barrier_t;
+#endif
 typedef int starpu_pthread_barrierattr_t;
-#define STARPU_PTHREAD_BARRIER_SERIAL_THREAD XBT_BARRIER_SERIAL_PROCESS
+#ifdef SG_BARRIER_SERIAL_THREAD
+#  define STARPU_PTHREAD_BARRIER_SERIAL_THREAD SG_BARRIER_SERIAL_THREAD
+#else
+#  define STARPU_PTHREAD_BARRIER_SERIAL_THREAD -1
+#endif
 #else
 typedef struct {
 	starpu_pthread_mutex_t mutex;
@@ -419,7 +463,11 @@ int starpu_pthread_wait_destroy(starpu_pthread_wait_t *w);
 
 #ifdef STARPU_SIMGRID
 
+#ifdef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+typedef sg_sem_t starpu_sem_t;
+#else
 typedef msg_sem_t starpu_sem_t;
+#endif
 int starpu_sem_destroy(starpu_sem_t *);
 int starpu_sem_getvalue(starpu_sem_t *, int *);
 int starpu_sem_init(starpu_sem_t *, int, unsigned);

+ 99 - 7
include/starpu_util.h

@@ -322,13 +322,19 @@ extern "C"
 			STARPU_ABORT(); }}
 #endif
 
-#if defined(__i386__) || defined(__x86_64__)
+/* Note: do not use starpu_cmpxchg / starpu_xchg / starpu_cmpxchgl /
+ * starpu_xchgl / starpu_cmpxchg64 / starpu_xchg64, which are only
+ * hand-written assembly fallbacks used when building with an old gcc.
+ * Rather use STARPU_VAL_COMPARE_AND_SWAP available on all platforms with a
+ * recent-enough gcc */
 
+#if defined(__i386__) || defined(__x86_64__)
 static __starpu_inline unsigned starpu_cmpxchg(unsigned *ptr, unsigned old, unsigned next)
 {
 	__asm__ __volatile__("lock cmpxchgl %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
 	return old;
 }
+#define STARPU_HAVE_CMPXCHG
 static __starpu_inline unsigned starpu_xchg(unsigned *ptr, unsigned next)
 {
 	/* Note: xchg is always locked already */
@@ -337,12 +343,27 @@ static __starpu_inline unsigned starpu_xchg(unsigned *ptr, unsigned next)
 }
 #define STARPU_HAVE_XCHG
 
+/* Atomically compare *ptr with old and, if they are equal, store next into
+ * *ptr (x86 "lock cmpxchgl"). Return the value *ptr held before the
+ * operation, which equals old exactly when the swap took place. */
+static __starpu_inline uint32_t starpu_cmpxchg32(uint32_t *ptr, uint32_t old, uint32_t next)
+{
+	__asm__ __volatile__("lock cmpxchgl %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
+	return old;
+}
+#define STARPU_HAVE_CMPXCHG32
+/* Atomically store next into *ptr (x86 "xchgl") and return the value *ptr
+ * held before the exchange. */
+static __starpu_inline uint32_t starpu_xchg32(uint32_t *ptr, uint32_t next)
+{
+	/* Note: xchg is always locked already */
+	__asm__ __volatile__("xchgl %1,%0": "+m" (*ptr), "+q" (next) : : "memory");
+	return next;
+}
+#define STARPU_HAVE_XCHG32
+
 #if defined(__i386__)
 static __starpu_inline unsigned long starpu_cmpxchgl(unsigned long *ptr, unsigned long old, unsigned long next)
 {
 	__asm__ __volatile__("lock cmpxchgl %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
 	return old;
 }
+#define STARPU_HAVE_CMPXCHGL
 static __starpu_inline unsigned long starpu_xchgl(unsigned long *ptr, unsigned long next)
 {
 	/* Note: xchg is always locked already */
@@ -358,6 +379,7 @@ static __starpu_inline unsigned long starpu_cmpxchgl(unsigned long *ptr, unsigne
 	__asm__ __volatile__("lock cmpxchgq %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
 	return old;
 }
+#define STARPU_HAVE_CMPXCHGL
 static __starpu_inline unsigned long starpu_xchgl(unsigned long *ptr, unsigned long next)
 {
 	/* Note: xchg is always locked already */
@@ -367,6 +389,33 @@ static __starpu_inline unsigned long starpu_xchgl(unsigned long *ptr, unsigned l
 #define STARPU_HAVE_XCHGL
 #endif
 
+#if defined(__i386__)
+/* 64-bit compare-and-swap for 32-bit x86, built on "lock cmpxchg8b": old is
+ * passed in edx:eax ("A" constraint) and next is split into ecx (high half)
+ * and ebx (low half). Return the value *ptr held before the operation, which
+ * equals old exactly when the swap took place. */
+static __starpu_inline uint64_t starpu_cmpxchg64(uint64_t *ptr, uint64_t old, uint64_t next)
+{
+	uint32_t next_hi = next >> 32;
+	uint32_t next_lo = next & 0xfffffffful;
+	__asm__ __volatile__("lock cmpxchg8b %1": "+A" (old), "+m" (*ptr) : "c" (next_hi), "b" (next_lo) : "memory");
+	return old;
+}
+#define STARPU_HAVE_CMPXCHG64
+#endif
+
+#if defined(__x86_64__)
+/* 64-bit compare-and-swap for x86_64 ("lock cmpxchgq"): if *ptr equals old,
+ * store next into *ptr. Return the value *ptr held before the operation,
+ * which equals old exactly when the swap took place. */
+static __starpu_inline uint64_t starpu_cmpxchg64(uint64_t *ptr, uint64_t old, uint64_t next)
+{
+	__asm__ __volatile__("lock cmpxchgq %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
+	return old;
+}
+#define STARPU_HAVE_CMPXCHG64
+/* Atomically store next into *ptr (x86_64 "xchgq") and return the value *ptr
+ * held before the exchange. */
+static __starpu_inline uint64_t starpu_xchg64(uint64_t *ptr, uint64_t next)
+{
+	/* Note: xchg is always locked already */
+	__asm__ __volatile__("xchgq %1,%0": "+m" (*ptr), "+q" (next) : : "memory");
+	return next;
+}
+#define STARPU_HAVE_XCHG64
+#endif
+
 #endif
 
 #define STARPU_ATOMIC_SOMETHING(name,expr) \
@@ -395,47 +444,90 @@ static __starpu_inline unsigned long starpu_atomic_##name##l(unsigned long *ptr,
 	}; \
 	return expr; \
 }
+/* Define starpu_atomic_<name>64(ptr, value): a compare-and-swap retry loop
+ * built on starpu_cmpxchg64() that replaces *ptr with expr (an expression
+ * written in terms of old and value) and returns the value that was stored. */
+#define STARPU_ATOMIC_SOMETHING64(name,expr) \
+static __starpu_inline uint64_t starpu_atomic_##name##64(uint64_t *ptr, uint64_t value) \
+{ \
+	uint64_t old, next; \
+	while (1) \
+	{ \
+		old = *ptr; \
+		next = expr; \
+		if (starpu_cmpxchg64(ptr, old, next) == old) \
+			break; \
+	}; \
+	return expr; \
+}
 
 /* Returns the new value */
 #ifdef STARPU_HAVE_SYNC_FETCH_AND_ADD
 #define STARPU_ATOMIC_ADD(ptr, value)  (__sync_fetch_and_add ((ptr), (value)) + (value))
 #define STARPU_ATOMIC_ADDL(ptr, value)  (__sync_fetch_and_add ((ptr), (value)) + (value))
+#define STARPU_ATOMIC_ADD64(ptr, value)  (__sync_fetch_and_add ((ptr), (value)) + (value))
 #else
-#if defined(STARPU_HAVE_XCHG)
+#if defined(STARPU_HAVE_CMPXCHG)
 STARPU_ATOMIC_SOMETHING(add, old + value)
 #define STARPU_ATOMIC_ADD(ptr, value) starpu_atomic_add(ptr, value)
 #endif
-#if defined(STARPU_HAVE_XCHGL)
+#if defined(STARPU_HAVE_CMPXCHGL)
 STARPU_ATOMIC_SOMETHINGL(add, old + value)
 #define STARPU_ATOMIC_ADDL(ptr, value) starpu_atomic_addl(ptr, value)
 #endif
+#if defined(STARPU_HAVE_CMPXCHG64)
+STARPU_ATOMIC_SOMETHING64(add, old + value)
+#define STARPU_ATOMIC_ADD64(ptr, value) starpu_atomic_add64(ptr, value)
+#endif
 #endif
 
 #ifdef STARPU_HAVE_SYNC_FETCH_AND_OR
 #define STARPU_ATOMIC_OR(ptr, value)  (__sync_fetch_and_or ((ptr), (value)))
 #define STARPU_ATOMIC_ORL(ptr, value)  (__sync_fetch_and_or ((ptr), (value)))
+#define STARPU_ATOMIC_OR64(ptr, value)  (__sync_fetch_and_or ((ptr), (value)))
 #else
-#if defined(STARPU_HAVE_XCHG)
+#if defined(STARPU_HAVE_CMPXCHG)
 STARPU_ATOMIC_SOMETHING(or, old | value)
 #define STARPU_ATOMIC_OR(ptr, value) starpu_atomic_or(ptr, value)
 #endif
-#if defined(STARPU_HAVE_XCHGL)
+#if defined(STARPU_HAVE_CMPXCHGL)
 STARPU_ATOMIC_SOMETHINGL(or, old | value)
 #define STARPU_ATOMIC_ORL(ptr, value) starpu_atomic_orl(ptr, value)
 #endif
+#if defined(STARPU_HAVE_CMPXCHG64)
+STARPU_ATOMIC_SOMETHING64(or, old | value)
+#define STARPU_ATOMIC_OR64(ptr, value) starpu_atomic_or64(ptr, value)
+#endif
 #endif
 
 #ifdef STARPU_HAVE_SYNC_BOOL_COMPARE_AND_SWAP
 #define STARPU_BOOL_COMPARE_AND_SWAP(ptr, old, value)  (__sync_bool_compare_and_swap ((ptr), (old), (value)))
-#elif defined(STARPU_HAVE_XCHG)
+#define STARPU_BOOL_COMPARE_AND_SWAP32(ptr, old, value) STARPU_BOOL_COMPARE_AND_SWAP(ptr, old, value)
+#define STARPU_BOOL_COMPARE_AND_SWAP64(ptr, old, value) STARPU_BOOL_COMPARE_AND_SWAP(ptr, old, value)
+#else
+#ifdef STARPU_HAVE_CMPXCHG
 #define STARPU_BOOL_COMPARE_AND_SWAP(ptr, old, value) (starpu_cmpxchg((ptr), (old), (value)) == (old))
 #endif
+#ifdef STARPU_HAVE_CMPXCHG32
+#define STARPU_BOOL_COMPARE_AND_SWAP32(ptr, old, value) (starpu_cmpxchg32((ptr), (old), (value)) == (old))
+#endif
+#ifdef STARPU_HAVE_CMPXCHG64
+#define STARPU_BOOL_COMPARE_AND_SWAP64(ptr, old, value) (starpu_cmpxchg64((ptr), (old), (value)) == (old))
+#endif
+#endif
 
 #ifdef STARPU_HAVE_SYNC_VAL_COMPARE_AND_SWAP
 #define STARPU_VAL_COMPARE_AND_SWAP(ptr, old, value)  (__sync_val_compare_and_swap ((ptr), (old), (value)))
-#elif defined(STARPU_HAVE_XCHG)
+#define STARPU_VAL_COMPARE_AND_SWAP32(ptr, old, value) STARPU_VAL_COMPARE_AND_SWAP(ptr, old, value)
+#define STARPU_VAL_COMPARE_AND_SWAP64(ptr, old, value) STARPU_VAL_COMPARE_AND_SWAP(ptr, old, value)
+#else
+#ifdef STARPU_HAVE_CMPXCHG
 #define STARPU_VAL_COMPARE_AND_SWAP(ptr, old, value) (starpu_cmpxchg((ptr), (old), (value)))
 #endif
+#ifdef STARPU_HAVE_CMPXCHG32
+#define STARPU_VAL_COMPARE_AND_SWAP32(ptr, old, value) (starpu_cmpxchg32((ptr), (old), (value)))
+#endif
+#ifdef STARPU_HAVE_CMPXCHG64
+#define STARPU_VAL_COMPARE_AND_SWAP64(ptr, old, value) (starpu_cmpxchg64((ptr), (old), (value)))
+#endif
+#endif
 
 /* Returns the previous value */
 #ifdef STARPU_HAVE_SYNC_LOCK_TEST_AND_SET

+ 2 - 2
include/starpu_worker.h

@@ -45,11 +45,11 @@ enum starpu_node_kind
 	STARPU_UNUSED=0,
 	STARPU_CPU_RAM=1,
 	STARPU_CUDA_RAM=2,
-        STARPU_FPGA_RAM=4,
 	STARPU_OPENCL_RAM=3,
 	STARPU_DISK_RAM=4,
 	STARPU_MIC_RAM=5,
 	STARPU_MPI_MS_RAM=6
+	STARPU_FPGA_RAM=7,
 };
 
 /**
@@ -490,7 +490,7 @@ int starpu_combined_worker_get_size(void);
 
 /**
    Return the rank of the current thread within the combined worker.
-   Can only be used in ::STARPU_FORKJOIN parallel tasks, to know which
+   Can only be used in ::STARPU_SPMD parallel tasks, to know which
    part of the task to work on.
 */
 int starpu_combined_worker_get_rank(void);

+ 20 - 0
mpi/examples/Makefile.am

@@ -140,6 +140,12 @@ examplebin_PROGRAMS += 			\
 	mpi_lu/plu_outofcore_example_float	\
 	mpi_lu/plu_outofcore_example_double
 
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES	+=	\
+	mpi_lu/plu_implicit_example_float	\
+	mpi_lu/plu_implicit_example_double
+endif
+
 mpi_lu_plu_example_float_LDADD =	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_BLAS_LDFLAGS) -lm
@@ -279,6 +285,7 @@ if BUILD_EXAMPLES
 if !STARPU_SANITIZE
 examplebin_PROGRAMS +=		\
 	native_fortran/nf_mm	\
+	native_fortran/nf_mm_task_build	\
 	native_fortran/nf_basic_ring
 
 native_fortran_nf_mm_SOURCES	=			\
@@ -290,6 +297,15 @@ native_fortran_nf_mm_SOURCES	=			\
 native_fortran_nf_mm_LDADD =					\
 	-lm
 
+native_fortran_nf_mm_task_build_SOURCES	=			\
+	native_fortran/nf_mm_cl.f90			\
+	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90	\
+	$(top_srcdir)/include/fstarpu_mod.f90		\
+	native_fortran/nf_mm_task_build.f90
+
+native_fortran_nf_mm_task_build_LDADD =					\
+	-lm
+
 native_fortran_nf_basic_ring_SOURCES	=			\
 	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90	\
 	$(top_srcdir)/include/fstarpu_mod.f90		\
@@ -301,6 +317,7 @@ native_fortran_nf_basic_ring_LDADD =					\
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
 	native_fortran/nf_mm			\
+	native_fortran/nf_mm_task_build		\
 	native_fortran/nf_basic_ring
 endif
 endif
@@ -396,6 +413,9 @@ nf_mm_cl.o: $(top_srcdir)/mpi/examples/native_fortran/nf_mm_cl.f90 fstarpu_mpi_m
 nf_mm.o: $(top_srcdir)/mpi/examples/native_fortran/nf_mm.f90 nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
 	$(AM_V_FC)$(FC) $(native_fortran_nf_mm_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_mm.f90' || echo '$(srcdir)/'`native_fortran/nf_mm.f90
 
+nf_mm_task_build.o: $(top_srcdir)/mpi/examples/native_fortran/nf_mm_task_build.f90 nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_mm_task_build_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_mm_task_build.f90' || echo '$(srcdir)/'`native_fortran/nf_mm_task_build.f90
+
 nf_basic_ring.o: $(top_srcdir)/mpi/examples/native_fortran/nf_basic_ring.f90 fstarpu_mpi_mod.mod fstarpu_mod.mod
 	$(AM_V_FC)$(FC) $(native_fortran_nf_basic_ring_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_basic_ring.f90' || echo '$(srcdir)/'`native_fortran/nf_basic_ring.f90
 endif

+ 2 - 1
mpi/examples/complex/mpi_complex.c

@@ -112,11 +112,12 @@ int main(int argc, char **argv)
 	}
 	else if (rank == 1)
 	{
+		MPI_Status status;
 		starpu_data_handle_t xhandle;
 		double xreal = 14.0;
 		double ximaginary = 18.0;
 		starpu_complex_data_register(&xhandle, STARPU_MAIN_RAM, &xreal, &ximaginary, 1);
-		starpu_mpi_recv(xhandle, 0, 10, MPI_COMM_WORLD, NULL);
+		starpu_mpi_recv(xhandle, 0, 10, MPI_COMM_WORLD, &status);
 		starpu_data_unregister(xhandle);
 		FPRINTF(stderr, "[received] real %f imaginary %f\n", xreal, ximaginary);
 		STARPU_ASSERT_MSG(xreal == 4 && ximaginary == 8, "Incorrect received value\n");

+ 0 - 5
mpi/examples/filters/filter.c

@@ -168,9 +168,4 @@ int main(int argc, char **argv)
 	starpu_mpi_shutdown();
 
 	return ok;
-
-enodev:
-	FPRINTF(stderr, "WARNING: No one can execute this task\n");
-	starpu_shutdown();
-	return 77;
 }

+ 11 - 4
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2018                                CNRS
+ * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2010,2011,2013-2015,2017,2018            Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2012,2013                                Inria
@@ -34,8 +34,8 @@
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned check = 0;
-static int p = 1;
-static int q = 1;
+static int p = -1;
+static int q = -1;
 static unsigned display = 0;
 static unsigned no_prio = 0;
 
@@ -252,7 +252,14 @@ int main(int argc, char **argv)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
 
-	STARPU_ASSERT(p*q == world_size);
+	if (p == -1 && q==-1)
+	{
+		/* p and q still hold their -1 defaults: pick a 2 x (world_size/2)
+		 * grid when the number of MPI processes is even, and a
+		 * 1 x world_size grid otherwise. The parity test must be done on
+		 * world_size, as q is still -1 at this point. */
+		fprintf(stderr, "Setting default values for p and q\n");
+		p = (world_size % 2 == 0) ? 2 : 1;
+		q = world_size / p;
+
+	}
+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
 
 	starpu_cublas_init();
 

+ 10 - 9
mpi/examples/native_fortran/nf_mm.f90

@@ -1,6 +1,6 @@
 ! StarPU --- Runtime system for heterogeneous multicore architectures.
 !
-! Copyright (C) 2017                                     CNRS
+! Copyright (C) 2017, 2019                               CNRS
 ! Copyright (C) 2016                                     Inria
 ! Copyright (C) 2016                                     Université de Bordeaux
 !
@@ -23,7 +23,8 @@ program nf_mm
         implicit none
 
         logical, parameter :: verbose = .false.
-        integer(c_int) :: comm_rank, comm_size, comm_world
+        integer(c_int) :: comm_size, comm_rank
+        integer(c_int), target :: comm_world
         integer(c_int) :: N = 16, BS = 4, NB
         real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)
         type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)
@@ -166,13 +167,13 @@ program nf_mm
         end do
 
         do b_col=1,NB
-        do b_row=1,NB
-                ret = fstarpu_mpi_task_insert(comm_world, (/ cl_mm, &
-                        FSTARPU_R,  dh_A(b_row), &
-                        FSTARPU_R,  dh_B(b_col), &
-                        FSTARPU_RW, dh_C(b_row,b_col), &
-                        C_NULL_PTR /))
-        end do
+           do b_row=1,NB
+              call fstarpu_mpi_task_insert((/ c_loc(comm_world), cl_mm, &
+                   FSTARPU_R,  dh_A(b_row), &
+                   FSTARPU_R,  dh_B(b_col), &
+                   FSTARPU_RW, dh_C(b_row,b_col), &
+                   C_NULL_PTR /))
+           end do
         end do
 
         call fstarpu_task_wait_for_all()

+ 248 - 0
mpi/examples/native_fortran/nf_mm_task_build.f90

@@ -0,0 +1,248 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2017, 2019                               CNRS
+! Copyright (C) 2016                                     Inria
+! Copyright (C) 2016                                     Université de Bordeaux
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+!
+program nf_mm
+        use iso_c_binding       ! C interfacing module
+        use fstarpu_mod         ! StarPU interfacing module
+        use fstarpu_mpi_mod     ! StarPU-MPI interfacing module
+        use nf_mm_cl
+        implicit none
+
+        logical, parameter :: verbose = .false.
+        integer(c_int) :: comm_size, comm_rank
+        integer(c_int), target :: comm_world
+        integer(c_int) :: N = 16, BS = 4, NB
+        real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)
+        type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)
+        type(c_ptr) :: cl_mm
+        type(c_ptr) :: task
+        integer(c_int) :: ncpu
+        integer(c_int) :: ret
+        integer(c_int) :: row, col
+        integer(c_int) :: b_row, b_col
+        integer(c_int) :: mr, tag, rank
+
+        ret = fstarpu_init(C_NULL_PTR)
+        if (ret == -19) then
+                stop 77
+        else if (ret /= 0) then
+                stop 1
+        end if
+
+        ret = fstarpu_mpi_init(1)
+        print *,"fstarpu_mpi_init status:", ret
+        if (ret /= 0) then
+                stop 1
+        end if
+
+        ! stop there if no CPU worker available
+        ncpu = fstarpu_cpu_worker_get_count()
+        if (ncpu == 0) then
+                call fstarpu_shutdown()
+                stop 77
+        end if
+
+        comm_world = fstarpu_mpi_world_comm()
+        comm_size = fstarpu_mpi_world_size()
+        comm_rank = fstarpu_mpi_world_rank()
+
+        if (comm_size < 2) then
+                call fstarpu_shutdown()
+                ret = fstarpu_mpi_shutdown()
+                stop 77
+        end if
+
+        ! TODO: process app's argc/argv
+        NB = N/BS
+
+        ! allocate and initialize codelet
+        cl_mm = fstarpu_codelet_allocate()
+        call fstarpu_codelet_set_name(cl_mm, c_char_"nf_mm_cl"//c_null_char)
+        call fstarpu_codelet_add_cpu_func(cl_mm, C_FUNLOC(cl_cpu_mult))
+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_RW)
+
+        ! allocate matrices
+        if (comm_rank == 0) then
+                allocate(A(N,N))
+                allocate(B(N,N))
+                allocate(C(N,N))
+        end if
+
+        ! init matrices
+        if (comm_rank == 0) then
+                do col=1,N
+                do row=1,N
+                if (row == col) then
+                        A(row,col) = 2
+                else
+                        A(row,col) = 0
+                end if
+                B(row,col) = row*N+col
+                C(row,col) = 0
+                end do
+                end do
+
+                if (verbose) then
+                        print *,"A"
+                        call mat_disp(A)
+                        print *,"B"
+                        call mat_disp(B)
+                        print *,"C"
+                        call mat_disp(C)
+                end if
+        end if
+
+        ! allocate data handles
+        allocate(dh_A(NB))
+        allocate(dh_B(NB))
+        allocate(dh_C(NB,NB))
+
+        ! register matrices
+        if (comm_rank == 0) then
+                mr = 0 ! TODO: use STARPU_MAIN_RAM constant
+        else
+                mr = -1
+        end if
+        tag = 0
+
+        do b_row=1,NB
+                if (comm_rank == 0) then
+                        call fstarpu_matrix_data_register(dh_A(b_row), mr, &
+                                c_loc( A(1+(b_row-1)*BS,1) ), N, BS, N, c_sizeof(A(1,1)))
+                else
+                        call fstarpu_matrix_data_register(dh_A(b_row), mr, &
+                                c_null_ptr, N, BS, N, c_sizeof(A(1,1)))
+                end if
+                call fstarpu_mpi_data_register(dh_A(b_row), tag, 0)
+                tag = tag+1
+        end do
+
+        do b_col=1,NB
+                if (comm_rank == 0) then
+                        call fstarpu_matrix_data_register(dh_B(b_col), mr, &
+                                c_loc( B(1,1+(b_col-1)*BS) ), N, N, BS, c_sizeof(B(1,1)))
+                else
+                        call fstarpu_matrix_data_register(dh_B(b_col), mr, &
+                                c_null_ptr, N, N, BS, c_sizeof(B(1,1)))
+                end if
+                call fstarpu_mpi_data_register(dh_B(b_col), tag, 0)
+                tag = tag+1
+        end do
+
+        do b_col=1,NB
+        do b_row=1,NB
+                if (comm_rank == 0) then
+                        call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
+                                c_loc( C(1+(b_row-1)*BS,1+(b_col-1)*BS) ), N, BS, BS, c_sizeof(C(1,1)))
+                else
+                        call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
+                                c_null_ptr, N, BS, BS, c_sizeof(C(1,1)))
+                end if
+                call fstarpu_mpi_data_register(dh_C(b_row,b_col), tag, 0)
+                tag = tag+1
+        end do
+        end do
+
+        ! distribute matrix C
+        do b_col=1,NB
+        do b_row=1,NB
+        rank = modulo(b_row+b_col, comm_size)
+        call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), rank)
+        end do
+        end do
+
+        do b_col=1,NB
+           do b_row=1,NB
+              task = fstarpu_mpi_task_build((/ c_loc(comm_world), cl_mm, &
+                   				FSTARPU_R,  dh_A(b_row), &
+                                                FSTARPU_R,  dh_B(b_col), &
+                                                FSTARPU_RW, dh_C(b_row,b_col), &
+                                                C_NULL_PTR /))
+              if (c_associated(task)) then
+                 ret = fstarpu_task_submit(task)
+              endif
+              call fstarpu_mpi_task_post_build((/ c_loc(comm_world), cl_mm, &
+                   				FSTARPU_R,  dh_A(b_row), &
+                                                FSTARPU_R,  dh_B(b_col), &
+                                                FSTARPU_RW, dh_C(b_row,b_col), &
+                                                C_NULL_PTR /))
+           end do
+        end do
+
+        call fstarpu_task_wait_for_all()
+
+        ! undistribute matrix C
+        do b_col=1,NB
+        do b_row=1,NB
+        call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), 0)
+        end do
+        end do
+
+        ! unregister matrices
+        do b_row=1,NB
+                call fstarpu_data_unregister(dh_A(b_row))
+        end do
+
+        do b_col=1,NB
+                call fstarpu_data_unregister(dh_B(b_col))
+        end do
+
+        do b_col=1,NB
+        do b_row=1,NB
+                call fstarpu_data_unregister(dh_C(b_row,b_col))
+        end do
+        end do
+
+        ! check result
+        if (comm_rank == 0) then
+                if (verbose) then
+                        print *,"final C"
+                        call mat_disp(C)
+                end if
+
+                do col=1,N
+                do row=1,N
+                if (abs(C(row,col) - 2*(row*N+col)) > 1.0) then
+                        print *, "check failed"
+                        stop 1
+                end if
+                end do
+                end do
+        end if
+
+        ! free handles
+        deallocate(dh_A)
+        deallocate(dh_B)
+        deallocate(dh_C)
+
+        ! free matrices
+        if (comm_rank == 0) then
+                deallocate(A)
+                deallocate(B)
+                deallocate(C)
+        end if
+        call fstarpu_codelet_free(cl_mm)
+        call fstarpu_shutdown()
+
+        ret = fstarpu_mpi_shutdown()
+        print *,"fstarpu_mpi_shutdown status:", ret
+        if (ret /= 0) then
+                stop 1
+        end if
+end program nf_mm

+ 4 - 1
mpi/examples/user_datatype/my_interface.c

@@ -202,7 +202,10 @@ static starpu_ssize_t data_describe(void *data_interface, char *buf, size_t size
 {
 	struct starpu_my_data_interface *my_data = (struct starpu_my_data_interface *) data_interface;
 	struct starpu_my_data *data = (struct starpu_my_data *)my_data->ptr;
-	return snprintf(buf, size, "Data%d-%c", data->d, data->c);
+	if (data)
+		return snprintf(buf, size, "Data%d-%c", data->d, data->c);
+	else
+		return snprintf(buf, size, "DataUNKNOWN");
 }
 
 static void *data_to_pointer(void *data_interface, unsigned node)

+ 2 - 1
mpi/examples/user_datatype/user_datatype.c

@@ -92,10 +92,11 @@ int main(int argc, char **argv)
 	}
 	else if (rank == 1)
 	{
+		MPI_Status status;
 		struct starpu_my_data myx = {.d = 11 , .c = 'a'};
 		starpu_data_handle_t handlex;
 		starpu_my_data_register(&handlex, STARPU_MAIN_RAM, &myx);
-		starpu_mpi_recv(handlex, 0, 10, MPI_COMM_WORLD, NULL);
+		starpu_mpi_recv(handlex, 0, 10, MPI_COMM_WORLD, &status);
 		starpu_data_unregister(handlex);
 		FPRINTF(stderr, "[starpu mpi] myx.d=%d myx.c=%c\n", myx.d, myx.c);
 		STARPU_ASSERT_MSG(myx.d == 98 && myx.c == 'z', "Incorrect received value\n");

+ 16 - 23
mpi/include/fstarpu_mpi_mod.f90

@@ -1,6 +1,6 @@
 ! StarPU --- Runtime system for heterogeneous multicore architectures.
 !
-! Copyright (C) 2017                                     CNRS
+! Copyright (C) 2017,2019                                CNRS
 ! Copyright (C) 2016                                     Inria
 ! Copyright (C) 2016,2017                                Université de Bordeaux
 !
@@ -247,34 +247,27 @@ module fstarpu_mpi_mod
                 end function fstarpu_mpi_shutdown
 
                 ! struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *codelet, ...);
-                function fstarpu_mpi_task_build(mpi_comm,arglist) bind(C)
-                        use iso_c_binding, only: c_ptr,c_int
+                function fstarpu_mpi_task_build(arglist) bind(C)
+                        use iso_c_binding, only: c_ptr
                         type(c_ptr) :: fstarpu_mpi_task_build
-                        integer(c_int), value, intent(in) :: mpi_comm
-                        type(c_ptr), dimension(:), intent(in) :: arglist
+                        type(c_ptr), dimension(*), intent(in) :: arglist
                 end function fstarpu_mpi_task_build
 
                 ! int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ...);
-                function fstarpu_mpi_task_post_build(mpi_comm,arglist) bind(C)
-                        use iso_c_binding, only: c_ptr,c_int
-                        integer(c_int) :: fstarpu_mpi_task_post_build
-                        integer(c_int), value, intent(in) :: mpi_comm
-                        type(c_ptr), dimension(:), intent(in) :: arglist
-                end function fstarpu_mpi_task_post_build
+                subroutine fstarpu_mpi_task_post_build(arglist) bind(C)
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), dimension(*), intent(in) :: arglist
+                end subroutine fstarpu_mpi_task_post_build
 
                 ! int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...);
-                function fstarpu_mpi_task_insert(mpi_comm,arglist) bind(C)
-                        use iso_c_binding, only: c_ptr,c_int
-                        integer(c_int) :: fstarpu_mpi_task_insert
-                        integer(c_int), value, intent(in) :: mpi_comm
-                        type(c_ptr), dimension(:), intent(in) :: arglist
-                end function fstarpu_mpi_task_insert
-                function fstarpu_mpi_insert_task(mpi_comm,arglist) bind(C,name="fstarpu_mpi_task_insert")
-                        use iso_c_binding, only: c_ptr,c_int
-                        integer(c_int) :: fstarpu_mpi_insert_task
-                        integer(c_int), value, intent(in) :: mpi_comm
-                        type(c_ptr), dimension(:), intent(in) :: arglist
-                end function fstarpu_mpi_insert_task
+                subroutine fstarpu_mpi_task_insert(arglist) bind(C)
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), dimension(*), intent(in) :: arglist
+                end subroutine fstarpu_mpi_task_insert
+                subroutine fstarpu_mpi_insert_task(arglist) bind(C,name="fstarpu_mpi_task_insert")
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), dimension(*), intent(in) :: arglist
+                end subroutine fstarpu_mpi_insert_task
 
                 ! void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);
                 subroutine fstarpu_mpi_get_data_on_node(mpi_comm,dh,node) bind(C)

+ 7 - 2
mpi/src/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2012,2016                                Inria
-# Copyright (C) 2010-2018                                CNRS
+# Copyright (C) 2010-2019                                CNRS
 # Copyright (C) 2009-2014,2018                           Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -73,6 +73,8 @@ noinst_HEADERS =					\
 	mpi/starpu_mpi_comm.h				\
 	mpi/starpu_mpi_tag.h				\
 	mpi/starpu_mpi_driver.h				\
+	mpi/starpu_mpi_mpi_backend.h			\
+	nmad/starpu_mpi_nmad_backend.h			\
 	load_balancer/policy/data_movements_interface.h	\
 	load_balancer/policy/load_data_interface.h	\
 	load_balancer/policy/load_balancer_policy.h
@@ -94,7 +96,9 @@ libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 	starpu_mpi_task_insert_fortran.c		\
 	starpu_mpi_init.c				\
 	nmad/starpu_mpi_nmad.c				\
+	nmad/starpu_mpi_nmad_backend.c			\
 	mpi/starpu_mpi_mpi.c				\
+	mpi/starpu_mpi_mpi_backend.c			\
 	mpi/starpu_mpi_early_data.c			\
 	mpi/starpu_mpi_early_request.c			\
 	mpi/starpu_mpi_sync_data.c			\
@@ -105,8 +109,9 @@ libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 	load_balancer/policy/load_heat_propagation.c	\
 	load_balancer/load_balancer.c
 
+recheck:
+	-cat /dev/null
 showcheck:
 	-cat /dev/null
-
 showsuite:
 	-cat /dev/null

+ 2 - 2
mpi/src/load_balancer/policy/load_heat_propagation.c

@@ -290,12 +290,12 @@ static void update_data_ranks()
 				//        fprintf(stderr,"Bring back data %p (tag %d) from node %d on node %d\n", handle, (data_movements_get_tags_table(data_movements_handles[i]))[j], starpu_mpi_data_get_rank(handle), my_rank);
 				//}
 
-				_STARPU_DEBUG("Call of starpu_mpi_get_data_on_node(%"PRIi64"d,%d) on node %d\n", starpu_mpi_data_get_tag(handle), dst_rank, my_rank);
+				_STARPU_DEBUG("Call of starpu_mpi_get_data_on_node(%"PRIi64",%d) on node %d\n", starpu_mpi_data_get_tag(handle), dst_rank, my_rank);
 
 				/* Migrate the data handle */
 				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handle, dst_rank, NULL, NULL);
 
-				_STARPU_DEBUG("New rank (%d) of data %"PRIi64"d upgraded on node %d\n", dst_rank, starpu_mpi_data_get_tag(handle), my_rank);
+				_STARPU_DEBUG("New rank (%d) of data %"PRIi64" upgraded on node %d\n", dst_rank, starpu_mpi_data_get_tag(handle), my_rank);
 				starpu_mpi_data_set_rank_comm(handle, dst_rank, MPI_COMM_WORLD);
 			}
 		}

+ 3 - 1
mpi/src/mpi/starpu_mpi_comm.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015-2017                                CNRS
+ * Copyright (C) 2015-2017, 2019                          CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,6 +23,8 @@
 
 #ifdef STARPU_USE_MPI_MPI
 
+#include <mpi/starpu_mpi_mpi_backend.h>
+
 #ifdef __cplusplus
 extern "C"
 {

+ 75 - 19
mpi/src/mpi/starpu_mpi_early_data.c

@@ -18,16 +18,18 @@
 #include <stdlib.h>
 #include <starpu_mpi.h>
 #include <mpi/starpu_mpi_early_data.h>
+#include <mpi/starpu_mpi_mpi_backend.h>
 #include <starpu_mpi_private.h>
-#include <common/uthash.h>
 
 #ifdef STARPU_USE_MPI_MPI
 
+/** The hashlist has 2 levels: the top level is indexed on the node, i.e. (comm, rank), the lower level is indexed on the data tag */
+
 struct _starpu_mpi_early_data_handle_hashlist
 {
-	struct _starpu_mpi_early_data_handle_list list;
+	struct _starpu_mpi_early_data_handle_tag_hashlist *datahash;
 	UT_hash_handle hh;
-	struct _starpu_mpi_node_tag node_tag;
+	struct _starpu_mpi_node node;
 };
 
 /** stores data which have been received by MPI but have not been requested by the application */
@@ -49,7 +51,11 @@ void _starpu_mpi_early_data_check_termination(void)
 		struct _starpu_mpi_early_data_handle_hashlist *current=NULL, *tmp=NULL;
 		HASH_ITER(hh, _starpu_mpi_early_data_handle_hashmap, current, tmp)
 		{
-			_STARPU_MSG("Unexpected message with comm %ld source %d tag %ld\n", (long int)current->node_tag.comm, current->node_tag.rank, current->node_tag.data_tag);
+			struct _starpu_mpi_early_data_handle_tag_hashlist *tag_current=NULL, *tag_tmp=NULL;
+			HASH_ITER(hh, current->datahash, tag_current, tag_tmp)
+			{
+				_STARPU_MSG("Unexpected message with comm %ld source %d tag %ld\n", (long int)current->node.comm, current->node.rank, tag_current->data_tag);
+			}
 		}
 		STARPU_ASSERT_MSG(_starpu_mpi_early_data_handle_hashmap_count == 0, "Number of unexpected received messages left is not 0 (but %d), did you forget to post a receive corresponding to a send?", _starpu_mpi_early_data_handle_hashmap_count);
 	}
@@ -60,7 +66,15 @@ void _starpu_mpi_early_data_shutdown(void)
 	struct _starpu_mpi_early_data_handle_hashlist *current=NULL, *tmp=NULL;
 	HASH_ITER(hh, _starpu_mpi_early_data_handle_hashmap, current, tmp)
 	{
-		STARPU_ASSERT(_starpu_mpi_early_data_handle_list_empty(&current->list));
+		_STARPU_MPI_DEBUG(600, "Hash early_data with comm %ld source %d\n", (long int) current->node.comm, current->node.rank);
+		struct _starpu_mpi_early_data_handle_tag_hashlist *tag_entry=NULL, *tag_tmp=NULL;
+		HASH_ITER(hh, current->datahash, tag_entry, tag_tmp)
+		{
+			_STARPU_MPI_DEBUG(600, "Hash 2nd level with tag %ld\n", tag_entry->data_tag);
+			STARPU_ASSERT(_starpu_mpi_early_data_handle_list_empty(&tag_entry->list));
+			HASH_DEL(current->datahash, tag_entry);
+			free(tag_entry);
+		}
 		HASH_DEL(_starpu_mpi_early_data_handle_hashmap, current);
 		free(current);
 	}
@@ -74,8 +88,8 @@ struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_create(struct _star
 	STARPU_PTHREAD_MUTEX_INIT(&early_data_handle->req_mutex, NULL);
 	STARPU_PTHREAD_COND_INIT(&early_data_handle->req_cond, NULL);
 	early_data_handle->env = envelope;
-	early_data_handle->node_tag.comm = comm;
-	early_data_handle->node_tag.rank = source;
+	early_data_handle->node_tag.node.comm = comm;
+	early_data_handle->node_tag.node.rank = source;
 	early_data_handle->node_tag.data_tag = envelope->data_tag;
 	return early_data_handle;
 }
@@ -86,45 +100,87 @@ struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(struct _starpu
 	struct _starpu_mpi_early_data_handle *early_data_handle;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_early_data_handle_mutex);
-	_STARPU_MPI_DEBUG(60, "Looking for early_data_handle with comm %ld source %d tag %ld\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
-	HASH_FIND(hh, _starpu_mpi_early_data_handle_hashmap, node_tag, sizeof(struct _starpu_mpi_node_tag), hashlist);
+	_STARPU_MPI_DEBUG(60, "Looking for early_data_handle with comm %ld source %d tag %ld\n", (long int)node_tag->node.comm, node_tag->node.rank, node_tag->data_tag);
+	HASH_FIND(hh, _starpu_mpi_early_data_handle_hashmap, &node_tag->node, sizeof(struct _starpu_mpi_node), hashlist);
 	if (hashlist == NULL)
 	{
+		_STARPU_MPI_DEBUG(600, "No entry for (comm %ld, source %d)\n", (long int)node_tag->node.comm, node_tag->node.rank);
 		early_data_handle = NULL;
 	}
 	else
 	{
-		if (_starpu_mpi_early_data_handle_list_empty(&hashlist->list))
+		struct _starpu_mpi_early_data_handle_tag_hashlist *tag_hashlist;
+		HASH_FIND(hh, hashlist->datahash, &node_tag->data_tag, sizeof(starpu_mpi_tag_t), tag_hashlist);
+		if (tag_hashlist == NULL)
+		{
+			_STARPU_MPI_DEBUG(600, "No entry for tag %ld\n", node_tag->data_tag);
+			early_data_handle = NULL;
+		}
+		else if (_starpu_mpi_early_data_handle_list_empty(&tag_hashlist->list))
 		{
+			_STARPU_MPI_DEBUG(600, "List empty for tag %ld\n", node_tag->data_tag);
 			early_data_handle = NULL;
 		}
 		else
 		{
 			_starpu_mpi_early_data_handle_hashmap_count --;
-			early_data_handle = _starpu_mpi_early_data_handle_list_pop_front(&hashlist->list);
+			early_data_handle = _starpu_mpi_early_data_handle_list_pop_front(&tag_hashlist->list);
 		}
 	}
-	_STARPU_MPI_DEBUG(60, "Found early_data_handle %p with comm %ld source %d tag %ld\n", early_data_handle, (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
+	_STARPU_MPI_DEBUG(60, "Found early_data_handle %p with comm %ld source %d tag %ld\n", early_data_handle, (long int)node_tag->node.comm, node_tag->node.rank, node_tag->data_tag);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_early_data_handle_mutex);
 	return early_data_handle;
 }
 
+/**
+ * Detach and return the per-tag entry matching \p node_tag from the 2-level
+ * early data hashmap, or NULL when there is no entry for the node (comm, rank)
+ * or for the data tag.
+ *
+ * The number of early data handles still queued in the extracted entry is
+ * subtracted from the global early data counter. The entry is only unlinked
+ * from the hash table, it is not freed here (presumably the caller takes
+ * ownership of it -- to be confirmed against the call sites).
+ */
+struct _starpu_mpi_early_data_handle_tag_hashlist *_starpu_mpi_early_data_extract(struct _starpu_mpi_node_tag *node_tag)
+{
+	struct _starpu_mpi_early_data_handle_tag_hashlist *tag_entry = NULL;
+	struct _starpu_mpi_early_data_handle_hashlist *node_entry = NULL;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_early_data_handle_mutex);
+
+	/* First level: look up the (comm, rank) entry */
+	_STARPU_MPI_DEBUG(60, "Looking for hashlist for (comm %ld, source %d)\n", (long int)node_tag->node.comm, node_tag->node.rank);
+	HASH_FIND(hh, _starpu_mpi_early_data_handle_hashmap, &node_tag->node, sizeof(struct _starpu_mpi_node), node_entry);
+	if (node_entry != NULL)
+	{
+		/* Second level: look up the data tag entry */
+		_STARPU_MPI_DEBUG(60, "Looking for hashlist for (tag %ld)\n", node_tag->data_tag);
+		HASH_FIND(hh, node_entry->datahash, &node_tag->data_tag, sizeof(starpu_mpi_tag_t), tag_entry);
+		if (tag_entry != NULL)
+		{
+			/* All the handles of the list leave the hashmap at once */
+			_starpu_mpi_early_data_handle_hashmap_count -= _starpu_mpi_early_data_handle_list_size(&tag_entry->list);
+			HASH_DEL(node_entry->datahash, tag_entry);
+		}
+	}
+
+	_STARPU_MPI_DEBUG(60, "Found hashlist %p for (comm %ld, source %d) and (tag %ld)\n", tag_entry, (long int)node_tag->node.comm, node_tag->node.rank, node_tag->data_tag);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_early_data_handle_mutex);
+	return tag_entry;
+}
+
 void _starpu_mpi_early_data_add(struct _starpu_mpi_early_data_handle *early_data_handle)
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_early_data_handle_mutex);
-	_STARPU_MPI_DEBUG(60, "Trying to add early_data_handle %p with comm %ld source %d tag %ld\n", early_data_handle, (long int)early_data_handle->node_tag.comm,
-			  early_data_handle->node_tag.rank, early_data_handle->node_tag.data_tag);
+	_STARPU_MPI_DEBUG(60, "Adding early_data_handle %p with comm %ld source %d tag %ld (%p)\n", early_data_handle, (long int)early_data_handle->node_tag.node.comm, early_data_handle->node_tag.node.rank, early_data_handle->node_tag.data_tag, &early_data_handle->node_tag.node);
 
 	struct _starpu_mpi_early_data_handle_hashlist *hashlist;
-	HASH_FIND(hh, _starpu_mpi_early_data_handle_hashmap, &early_data_handle->node_tag, sizeof(struct _starpu_mpi_node_tag), hashlist);
+	HASH_FIND(hh, _starpu_mpi_early_data_handle_hashmap, &early_data_handle->node_tag.node, sizeof(struct _starpu_mpi_node), hashlist);
 	if (hashlist == NULL)
 	{
 		_STARPU_MPI_MALLOC(hashlist, sizeof(struct _starpu_mpi_early_data_handle_hashlist));
-		_starpu_mpi_early_data_handle_list_init(&hashlist->list);
-		hashlist->node_tag = early_data_handle->node_tag;
-		HASH_ADD(hh, _starpu_mpi_early_data_handle_hashmap, node_tag, sizeof(hashlist->node_tag), hashlist);
+		hashlist->node = early_data_handle->node_tag.node;
+		hashlist->datahash = NULL;
+		HASH_ADD(hh, _starpu_mpi_early_data_handle_hashmap, node, sizeof(hashlist->node), hashlist);
 	}
-	_starpu_mpi_early_data_handle_list_push_back(&hashlist->list, early_data_handle);
+
+	struct _starpu_mpi_early_data_handle_tag_hashlist *tag_hashlist;
+	HASH_FIND(hh, hashlist->datahash, &early_data_handle->node_tag.data_tag, sizeof(starpu_mpi_tag_t), tag_hashlist);
+	if (tag_hashlist == NULL)
+	{
+		_STARPU_MPI_MALLOC(tag_hashlist, sizeof(struct _starpu_mpi_early_data_handle_tag_hashlist));
+		tag_hashlist->data_tag = early_data_handle->node_tag.data_tag;
+		HASH_ADD(hh, hashlist->datahash, data_tag, sizeof(tag_hashlist->data_tag), tag_hashlist);
+		_starpu_mpi_early_data_handle_list_init(&tag_hashlist->list);
+	}
+
+	_starpu_mpi_early_data_handle_list_push_back(&tag_hashlist->list, early_data_handle);
 	_starpu_mpi_early_data_handle_hashmap_count ++;
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_early_data_handle_mutex);
 }

+ 11 - 1
mpi/src/mpi/starpu_mpi_early_data.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017,2019                           CNRS
  * Copyright (C) 2009-2014,2016                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,6 +23,7 @@
 #include <mpi.h>
 #include <common/config.h>
 #include <common/list.h>
+#include <common/uthash.h>
 #include <starpu_mpi_private.h>
 
 #ifdef STARPU_USE_MPI_MPI
@@ -43,6 +44,13 @@ LIST_TYPE(_starpu_mpi_early_data_handle,
 	  starpu_pthread_cond_t req_cond;
 );
 
+struct _starpu_mpi_early_data_handle_tag_hashlist /* second-level entry of the early data hashmap: one per data tag */
+{
+	struct _starpu_mpi_early_data_handle_list list; /* early data handles queued for this data tag */
+	UT_hash_handle hh; /* uthash bookkeeping handle */
+	starpu_mpi_tag_t data_tag; /* hash key: the data tag */
+};
+
 void _starpu_mpi_early_data_init(void);
 void _starpu_mpi_early_data_check_termination(void);
 void _starpu_mpi_early_data_shutdown(void);
@@ -51,6 +59,8 @@ struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_create(struct _star
 struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(struct _starpu_mpi_node_tag *node_tag);
 void _starpu_mpi_early_data_add(struct _starpu_mpi_early_data_handle *early_data_handle);
 
+struct _starpu_mpi_early_data_handle_tag_hashlist *_starpu_mpi_early_data_extract(struct _starpu_mpi_node_tag *node_tag);
+
 #ifdef __cplusplus
 }
 #endif

+ 69 - 16
mpi/src/mpi/starpu_mpi_early_request.c

@@ -24,11 +24,12 @@
 #ifdef STARPU_USE_MPI_MPI
 
 /** stores application requests for which data have not been received yet */
+/** The hashlist has 2 levels: the top level is indexed on the node, i.e. (comm, rank), the lower level is indexed on the data tag */
 struct _starpu_mpi_early_request_hashlist
 {
-	struct _starpu_mpi_req_list list;
+	struct _starpu_mpi_early_request_tag_hashlist *datahash;
 	UT_hash_handle hh;
-	struct _starpu_mpi_node_tag node_tag;
+	struct _starpu_mpi_node node;
 };
 
 static starpu_pthread_mutex_t _starpu_mpi_early_request_mutex;
@@ -47,7 +48,14 @@ void _starpu_mpi_early_request_shutdown()
 	struct _starpu_mpi_early_request_hashlist *entry=NULL, *tmp=NULL;
 	HASH_ITER(hh, _starpu_mpi_early_request_hash, entry, tmp)
 	{
-		STARPU_ASSERT(_starpu_mpi_req_list_empty(&entry->list));
+		struct _starpu_mpi_early_request_tag_hashlist *tag_entry=NULL, *tag_tmp=NULL;
+		HASH_ITER(hh, entry->datahash, tag_entry, tag_tmp)
+		{
+			STARPU_ASSERT(_starpu_mpi_req_list_empty(&tag_entry->list));
+			HASH_DEL(entry->datahash, tag_entry);
+			free(tag_entry);
+		}
+
 		HASH_DEL(_starpu_mpi_early_request_hash, entry);
 		free(entry);
 	}
@@ -71,49 +79,94 @@ struct _starpu_mpi_req* _starpu_mpi_early_request_dequeue(starpu_mpi_tag_t data_
 	struct _starpu_mpi_early_request_hashlist *hashlist;
 
 	memset(&node_tag, 0, sizeof(struct _starpu_mpi_node_tag));
-	node_tag.comm = comm;
-	node_tag.rank = source;
+	node_tag.node.comm = comm;
+	node_tag.node.rank = source;
 	node_tag.data_tag = data_tag;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_early_request_mutex);
-	_STARPU_MPI_DEBUG(100, "Looking for early_request with comm %ld source %d tag %ld\n", (long int)node_tag.comm, node_tag.rank, node_tag.data_tag);
-	HASH_FIND(hh, _starpu_mpi_early_request_hash, &node_tag, sizeof(struct _starpu_mpi_node_tag), hashlist);
+	_STARPU_MPI_DEBUG(100, "Looking for early_request with comm %ld source %d tag %ld\n", (long int)node_tag.node.comm, node_tag.node.rank, node_tag.data_tag);
+	HASH_FIND(hh, _starpu_mpi_early_request_hash, &node_tag.node, sizeof(struct _starpu_mpi_node), hashlist);
 	if (hashlist == NULL)
 	{
 		found = NULL;
 	}
 	else
 	{
-		if (_starpu_mpi_req_list_empty(&hashlist->list))
+		struct _starpu_mpi_early_request_tag_hashlist *tag_hashlist;
+		HASH_FIND(hh, hashlist->datahash, &node_tag.data_tag, sizeof(starpu_mpi_tag_t), tag_hashlist);
+		if (tag_hashlist == NULL)
+		{
+			found = NULL;
+		}
+		else if (_starpu_mpi_req_list_empty(&tag_hashlist->list))
 		{
 			found = NULL;
 		}
 		else
 		{
-			found = _starpu_mpi_req_list_pop_front(&hashlist->list);
+			found = _starpu_mpi_req_list_pop_front(&tag_hashlist->list);
 			_starpu_mpi_early_request_hash_count --;
 		}
 	}
-	_STARPU_MPI_DEBUG(100, "Found early_request %p with comm %ld source %d tag %ld\n", found, (long int)node_tag.comm, node_tag.rank, node_tag.data_tag);
+	_STARPU_MPI_DEBUG(100, "Found early_request %p with comm %ld source %d tag %ld\n", found, (long int)node_tag.node.comm, node_tag.node.rank, node_tag.data_tag);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_early_request_mutex);
 	return found;
 }
 
+/**
+ * Detach and return the per-tag entry of early requests registered for
+ * (\p comm, \p source, \p data_tag), or NULL when there is no entry for the
+ * node (comm, source) or for the data tag.
+ *
+ * The number of requests still queued in the extracted entry is subtracted
+ * from the global early request counter. The entry is only unlinked from the
+ * hash table, it is not freed here (presumably the caller takes ownership of
+ * it -- to be confirmed against the call sites).
+ */
+struct _starpu_mpi_early_request_tag_hashlist *_starpu_mpi_early_request_extract(starpu_mpi_tag_t data_tag, int source, MPI_Comm comm)
+{
+	struct _starpu_mpi_node_tag node_tag;
+	struct _starpu_mpi_early_request_hashlist *hashlist;
+	struct _starpu_mpi_early_request_tag_hashlist *tag_hashlist = NULL;
+
+	/* The node part is used below as a raw hash key of sizeof(struct _starpu_mpi_node)
+	 * bytes, so the whole structure is zeroed before its fields are set */
+	memset(&node_tag, 0, sizeof(struct _starpu_mpi_node_tag));
+	node_tag.node.comm = comm;
+	node_tag.node.rank = source;
+	node_tag.data_tag = data_tag;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_early_request_mutex);
+	_STARPU_MPI_DEBUG(100, "Looking for early_request with comm %ld source %d tag %ld\n", (long int)node_tag.node.comm, node_tag.node.rank, node_tag.data_tag);
+	/* First level: (comm, rank) entry */
+	HASH_FIND(hh, _starpu_mpi_early_request_hash, &node_tag.node, sizeof(struct _starpu_mpi_node), hashlist);
+	if (hashlist)
+	{
+		/* Second level: data tag entry */
+		HASH_FIND(hh, hashlist->datahash, &node_tag.data_tag, sizeof(starpu_mpi_tag_t), tag_hashlist);
+		if (tag_hashlist)
+		{
+			/* All the requests of the list leave the hashmap at once */
+			_starpu_mpi_early_request_hash_count -= _starpu_mpi_req_list_size(&tag_hashlist->list);
+			HASH_DEL(hashlist->datahash, tag_hashlist);
+		}
+	}
+	/* Trace the entry which is actually returned (the per-tag one), not the first-level one */
+	_STARPU_MPI_DEBUG(100, "Found hashlist %p with comm %ld source %d tag %ld\n", tag_hashlist, (long int)node_tag.node.comm, node_tag.node.rank, node_tag.data_tag);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_early_request_mutex);
+	return tag_hashlist;
+}
+
 void _starpu_mpi_early_request_enqueue(struct _starpu_mpi_req *req)
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_early_request_mutex);
-	_STARPU_MPI_DEBUG(100, "Adding request %p with comm %ld source %d tag %ld in the application request hashmap\n", req, (long int)req->node_tag.comm, req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_DEBUG(100, "Adding request %p with comm %ld source %d tag %ld in the application request hashmap\n", req, (long int)req->node_tag.node.comm, req->node_tag.node.rank, req->node_tag.data_tag);
 
 	struct _starpu_mpi_early_request_hashlist *hashlist;
-	HASH_FIND(hh, _starpu_mpi_early_request_hash, &req->node_tag, sizeof(struct _starpu_mpi_node_tag), hashlist);
+	HASH_FIND(hh, _starpu_mpi_early_request_hash, &req->node_tag.node, sizeof(struct _starpu_mpi_node), hashlist);
 	if (hashlist == NULL)
 	{
 		_STARPU_MPI_MALLOC(hashlist, sizeof(struct _starpu_mpi_early_request_hashlist));
-		_starpu_mpi_req_list_init(&hashlist->list);
-		hashlist->node_tag = req->node_tag;
-		HASH_ADD(hh, _starpu_mpi_early_request_hash, node_tag, sizeof(hashlist->node_tag), hashlist);
+		hashlist->node = req->node_tag.node;
+		hashlist->datahash = NULL;
+		HASH_ADD(hh, _starpu_mpi_early_request_hash, node, sizeof(hashlist->node), hashlist);
+	}
+
+	struct _starpu_mpi_early_request_tag_hashlist *tag_hashlist;
+	HASH_FIND(hh, hashlist->datahash, &req->node_tag.data_tag, sizeof(starpu_mpi_tag_t), tag_hashlist);
+	if (tag_hashlist == NULL)
+	{
+		_STARPU_MPI_MALLOC(tag_hashlist, sizeof(struct _starpu_mpi_early_request_tag_hashlist));
+		tag_hashlist->data_tag = req->node_tag.data_tag;
+		HASH_ADD(hh, hashlist->datahash, data_tag, sizeof(tag_hashlist->data_tag), tag_hashlist);
+		_starpu_mpi_req_list_init(&tag_hashlist->list);
 	}
-	_starpu_mpi_req_list_push_back(&hashlist->list, req);
+
+	_starpu_mpi_req_list_push_back(&tag_hashlist->list, req);
 	_starpu_mpi_early_request_hash_count ++;
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_early_request_mutex);
 }

+ 10 - 1
mpi/src/mpi/starpu_mpi_early_request.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017,2019                           CNRS
  * Copyright (C) 2009-2014                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,6 +31,13 @@ extern "C"
 {
 #endif
 
+struct _starpu_mpi_early_request_tag_hashlist /* second-level entry of the early request hashmap: one per data tag */
+{
+	struct _starpu_mpi_req_list list; /* requests queued for this data tag */
+	UT_hash_handle hh; /* uthash bookkeeping handle */
+	starpu_mpi_tag_t data_tag; /* hash key: the data tag */
+};
+
 void _starpu_mpi_early_request_init(void);
 void _starpu_mpi_early_request_shutdown(void);
 int _starpu_mpi_early_request_count(void);
@@ -39,6 +46,8 @@ void _starpu_mpi_early_request_check_termination(void);
 void _starpu_mpi_early_request_enqueue(struct _starpu_mpi_req *req);
 struct _starpu_mpi_req* _starpu_mpi_early_request_dequeue(starpu_mpi_tag_t data_tag, int source, MPI_Comm comm);
 
+struct _starpu_mpi_early_request_tag_hashlist *_starpu_mpi_early_request_extract(starpu_mpi_tag_t data_tag, int source, MPI_Comm comm);
+
 #ifdef __cplusplus
 }
 #endif

+ 150 - 149
mpi/src/mpi/starpu_mpi_mpi.c

@@ -152,7 +152,7 @@ void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, in
 	{
 		if (coop_sends->reqs_array[i]->request_type == SEND_REQ && submit_data)
 		{
-			_STARPU_MPI_DEBUG(0, "cooperative sends %p sending to %d\n", coop_sends, coop_sends->reqs_array[i]->node_tag.rank);
+			_STARPU_MPI_DEBUG(0, "cooperative sends %p sending to %d\n", coop_sends, coop_sends->reqs_array[i]->node_tag.node.rank);
 			_starpu_mpi_submit_ready_request(coop_sends->reqs_array[i]);
 		}
 		/* TODO: handle redirect requests */
@@ -166,7 +166,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
-	_STARPU_MPI_DEBUG(0, "new req %p srcdst %d tag %"PRIi64"d and type %s %d\n", req, req->node_tag.rank, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->is_internal_req);
+	_STARPU_MPI_DEBUG(0, "new req %p srcdst %d tag %"PRIi64" and type %s %d\n", req, req->node_tag.node.rank, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->backend->is_internal_req);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 
@@ -178,7 +178,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 		 * pointer associated to the data_handle, and push it into the
 		 * ready_requests list, so as the real MPI request can be submitted
 		 * before the next submission of the envelope-catching request. */
-		if (req->is_internal_req)
+		if (req->backend->is_internal_req)
 		{
 			_starpu_mpi_datatype_allocate(req->data_handle, req);
 			if (req->registered_datatype == 1)
@@ -192,18 +192,18 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				_STARPU_MPI_MALLOC(req->ptr, req->count);
 			}
 
-			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64"d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
-					  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr,
+			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
+					  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr,
 					  req->datatype_name, (int)req->count, req->registered_datatype);
 			_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 			_STARPU_MPI_INC_READY_REQUESTS(+1);
 
 			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-			STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
+			STARPU_PTHREAD_MUTEX_LOCK(&req->backend->posted_mutex);
 			req->posted = 1;
-			STARPU_PTHREAD_COND_BROADCAST(&req->posted_cond);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&req->posted_mutex);
+			STARPU_PTHREAD_COND_BROADCAST(&req->backend->posted_cond);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->posted_mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 		}
 		else
@@ -224,11 +224,11 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req_mutex));
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 
-				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %"PRIi64"d has already been received, copying previously received data into handle's pointer..\n", req, req->node_tag.data_tag);
+				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %"PRIi64" has already been received, copying previously received data into handle's pointer..\n", req, req->node_tag.data_tag);
 				STARPU_ASSERT(req->data_handle != early_data_handle->handle);
 
-				req->internal_req = early_data_handle->req;
-				req->early_data_handle = early_data_handle;
+				req->backend->internal_req = early_data_handle->req;
+				req->backend->early_data_handle = early_data_handle;
 
 				struct _starpu_mpi_early_data_cb_args *cb_args;
 				_STARPU_MPI_MALLOC(cb_args, sizeof(struct _starpu_mpi_early_data_cb_args));
@@ -245,8 +245,8 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			/* Case: no matching data has been received. Store the receive request as an early_request. */
 			else
 			{
-				struct _starpu_mpi_req *sync_req = _starpu_mpi_sync_data_find(req->node_tag.data_tag, req->node_tag.rank, req->node_tag.comm);
-				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64"d and src %d = %p\n", req->node_tag.data_tag, req->node_tag.rank, sync_req);
+				struct _starpu_mpi_req *sync_req = _starpu_mpi_sync_data_find(req->node_tag.data_tag, req->node_tag.node.rank, req->node_tag.node.comm);
+				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
 				if (sync_req)
 				{
 					req->sync = 1;
@@ -268,7 +268,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				}
 				else
 				{
-					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64"d) into the request hashmap\n", req, req->node_tag.rank, req->node_tag.data_tag);
+					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
 					_starpu_mpi_early_request_enqueue(req);
 				}
 			}
@@ -281,8 +281,8 @@ void _starpu_mpi_submit_ready_request(void *arg)
 		else
 			_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 		_STARPU_MPI_INC_READY_REQUESTS(+1);
-		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %"PRIi64"d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
-				  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr,
+		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
+				  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr,
 				  req->datatype_name, (int)req->count, req->registered_datatype);
 	}
 
@@ -359,36 +359,36 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 
-	_STARPU_MPI_DEBUG(0, "post MPI isend request %p type %s tag %"PRIi64"d src %d data %p datasize %ld ptr %p datatype '%s' count %d registered_datatype %d sync %d\n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, req->datatype_name, (int)req->count, req->registered_datatype, req->sync);
+	_STARPU_MPI_DEBUG(0, "post MPI isend request %p type %s tag %"PRIi64" src %d data %p datasize %ld ptr %p datatype '%s' count %d registered_datatype %d sync %d\n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, req->datatype_name, (int)req->count, req->registered_datatype, req->sync);
 
-	_starpu_mpi_comm_amounts_inc(req->node_tag.comm, req->node_tag.rank, req->datatype, req->count);
+	_starpu_mpi_comm_amounts_inc(req->node_tag.node.comm, req->node_tag.node.rank, req->datatype, req->count);
 
-	_STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag, 0);
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag, 0);
 
 	if (req->sync == 0)
 	{
-		_STARPU_MPI_COMM_TO_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
-		req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.comm, &req->data_request);
+		_STARPU_MPI_COMM_TO_DEBUG(req, req->count, req->datatype, req->node_tag.node.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.node.comm);
+		req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->node_tag.node.rank, _STARPU_MPI_TAG_DATA, req->node_tag.node.comm, &req->backend->data_request);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
 	else
 	{
-		_STARPU_MPI_COMM_TO_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
-		req->ret = MPI_Issend(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.comm, &req->data_request);
+		_STARPU_MPI_COMM_TO_DEBUG(req, req->count, req->datatype, req->node_tag.node.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.node.comm);
+		req->ret = MPI_Issend(req->ptr, req->count, req->datatype, req->node_tag.node.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.node.comm, &req->backend->data_request);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Issend returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
 
 #ifdef STARPU_SIMGRID
-	_starpu_mpi_simgrid_wait_req(&req->data_request, &req->status_store, &req->queue, &req->done);
+	_starpu_mpi_simgrid_wait_req(&req->backend->data_request, &req->status_store, &req->queue, &req->done);
 #endif
 
-	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid);
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid);
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
-	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&req->backend->req_mutex);
 	req->submitted = 1;
-	STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+	STARPU_PTHREAD_COND_BROADCAST(&req->backend->req_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->req_mutex);
 
 	_starpu_mpi_handle_detached_request(req);
 
@@ -399,54 +399,55 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 {
 	_starpu_mpi_datatype_allocate(req->data_handle, req);
 
-	_STARPU_MPI_CALLOC(req->envelope, 1,sizeof(struct _starpu_mpi_envelope));
-	req->envelope->mode = _STARPU_MPI_ENVELOPE_DATA;
-	req->envelope->data_tag = req->node_tag.data_tag;
-	req->envelope->sync = req->sync;
+	_STARPU_MPI_CALLOC(req->backend->envelope, 1,sizeof(struct _starpu_mpi_envelope));
+	req->backend->envelope->mode = _STARPU_MPI_ENVELOPE_DATA;
+	req->backend->envelope->data_tag = req->node_tag.data_tag;
+	req->backend->envelope->sync = req->sync;
 
 	if (req->registered_datatype == 1)
 	{
-		int size;
+		int size, ret;
 		req->count = 1;
 		req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
 
 		MPI_Type_size(req->datatype, &size);
-		req->envelope->size = (starpu_ssize_t)req->count * size;
-		_STARPU_MPI_DEBUG(20, "Post MPI isend count (%ld) datatype_size %ld request to %d\n",req->count,starpu_data_get_size(req->data_handle), req->node_tag.rank);
-		_STARPU_MPI_COMM_TO_DEBUG(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->envelope->data_tag, req->node_tag.comm);
-		MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
+		req->backend->envelope->size = (starpu_ssize_t)req->count * size;
+		_STARPU_MPI_DEBUG(20, "Post MPI isend count (%ld) datatype_size %ld request to %d\n",req->count,starpu_data_get_size(req->data_handle), req->node_tag.node.rank);
+		_STARPU_MPI_COMM_TO_DEBUG(req->backend->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.node.rank, _STARPU_MPI_TAG_ENVELOPE, req->backend->envelope->data_tag, req->node_tag.node.comm);
+		ret = MPI_Isend(req->backend->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.node.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.node.comm, &req->backend->size_req);
+		STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending envelope, MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(ret));
 	}
 	else
 	{
 		int ret;
 
  		// Do not pack the data, just try to find out the size
-		starpu_data_pack(req->data_handle, NULL, &(req->envelope->size));
+		starpu_data_pack(req->data_handle, NULL, &(req->backend->envelope->size));
 
-		if (req->envelope->size != -1)
+		if (req->backend->envelope->size != -1)
  		{
  			// We already know the size of the data, let's send it to overlap with the packing of the data
-			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (first call to pack)\n", req->envelope->size, sizeof(req->count), "MPI_BYTE", req->node_tag.rank);
-			req->count = req->envelope->size;
-			_STARPU_MPI_COMM_TO_DEBUG(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->envelope->data_tag, req->node_tag.comm);
-			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
+			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (first call to pack)\n", req->backend->envelope->size, sizeof(req->count), "MPI_BYTE", req->node_tag.node.rank);
+			req->count = req->backend->envelope->size;
+			_STARPU_MPI_COMM_TO_DEBUG(req->backend->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.node.rank, _STARPU_MPI_TAG_ENVELOPE, req->backend->envelope->data_tag, req->node_tag.node.comm);
+			ret = MPI_Isend(req->backend->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.node.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.node.comm, &req->backend->size_req);
 			STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(ret));
  		}
 
  		// Pack the data
  		starpu_data_pack(req->data_handle, &req->ptr, &req->count);
-		if (req->envelope->size == -1)
+		if (req->backend->envelope->size == -1)
  		{
  			// We know the size now, let's send it
-			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (second call to pack)\n", req->envelope->size, sizeof(req->count), "MPI_BYTE", req->node_tag.rank);
-			_STARPU_MPI_COMM_TO_DEBUG(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->envelope->data_tag, req->node_tag.comm);
-			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
+			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (second call to pack)\n", req->backend->envelope->size, sizeof(req->count), "MPI_BYTE", req->node_tag.node.rank);
+			_STARPU_MPI_COMM_TO_DEBUG(req->backend->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.node.rank, _STARPU_MPI_TAG_ENVELOPE, req->backend->envelope->data_tag, req->node_tag.node.comm);
+			ret = MPI_Isend(req->backend->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.node.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.node.comm, &req->backend->size_req);
 			STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(ret));
  		}
  		else
  		{
  			// We check the size returned with the 2 calls to pack is the same
-			STARPU_MPI_ASSERT_MSG(req->count == req->envelope->size, "Calls to pack_data returned different sizes %ld != %ld", req->count, req->envelope->size);
+			STARPU_MPI_ASSERT_MSG(req->count == req->backend->envelope->size, "Calls to pack_data returned different sizes %ld != %ld", req->count, req->backend->envelope->size);
  		}
 		// We can send the data now
 	}
@@ -473,9 +474,9 @@ void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 
-	_STARPU_MPI_DEBUG(0, "post MPI irecv request %p type %s tag %"PRIi64"d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
+	_STARPU_MPI_DEBUG(0, "post MPI irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
-	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 
 	if (req->sync)
 	{
@@ -483,9 +484,9 @@ void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 		_STARPU_MPI_CALLOC(_envelope, 1, sizeof(struct _starpu_mpi_envelope));
 		_envelope->mode = _STARPU_MPI_ENVELOPE_SYNC_READY;
 		_envelope->data_tag = req->node_tag.data_tag;
-		_STARPU_MPI_DEBUG(20, "Telling node %d it can send the data and waiting for the data back ...\n", req->node_tag.rank);
-		_STARPU_MPI_COMM_TO_DEBUG(_envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _envelope->data_tag, req->node_tag.comm);
-		req->ret = MPI_Send(_envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
+		_STARPU_MPI_DEBUG(20, "Telling node %d it can send the data and waiting for the data back ...\n", req->node_tag.node.rank);
+		_STARPU_MPI_COMM_TO_DEBUG(_envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.node.rank, _STARPU_MPI_TAG_ENVELOPE, _envelope->data_tag, req->node_tag.node.comm);
+		req->ret = MPI_Send(_envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.node.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.node.comm);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Send returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 		free(_envelope);
 		_envelope = NULL;
@@ -493,26 +494,26 @@ void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 
 	if (req->sync)
 	{
-		_STARPU_MPI_COMM_FROM_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
-		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.comm, &req->data_request);
+		_STARPU_MPI_COMM_FROM_DEBUG(req, req->count, req->datatype, req->node_tag.node.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.node.comm);
+		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.node.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.node.comm, &req->backend->data_request);
 	}
 	else
 	{
-		_STARPU_MPI_COMM_FROM_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
-		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.comm, &req->data_request);
+		_STARPU_MPI_COMM_FROM_DEBUG(req, req->count, req->datatype, req->node_tag.node.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.node.comm);
+		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.node.rank, _STARPU_MPI_TAG_DATA, req->node_tag.node.comm, &req->backend->data_request);
 #ifdef STARPU_SIMGRID
-		_starpu_mpi_simgrid_wait_req(&req->data_request, &req->status_store, &req->queue, &req->done);
+		_starpu_mpi_simgrid_wait_req(&req->backend->data_request, &req->status_store, &req->queue, &req->done);
 #endif
 	}
 	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
-	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag);
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
-	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&req->backend->req_mutex);
 	req->submitted = 1;
-	STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+	STARPU_PTHREAD_COND_BROADCAST(&req->backend->req_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->req_mutex);
 
 	_starpu_mpi_handle_detached_request(req);
 
@@ -529,19 +530,19 @@ void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 {
 	_STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are waiting for ? */
-	struct _starpu_mpi_req *req = waiting_req->other_request;
+	struct _starpu_mpi_req *req = waiting_req->backend->other_request;
 
-	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
-	if (req->data_request != MPI_REQUEST_NULL)
+	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
+	if (req->backend->data_request != MPI_REQUEST_NULL)
 	{
 		// TODO: Fix for STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 		STARPU_MPI_ASSERT_MSG(0, "Implement this in STARPU_SIMGRID");
 #endif
-		req->ret = MPI_Wait(&req->data_request, waiting_req->status);
+		req->ret = MPI_Wait(&req->backend->data_request, waiting_req->status);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
-	_STARPU_MPI_TRACE_UWAIT_END(req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_TRACE_UWAIT_END(req->node_tag.node.rank, req->node_tag.data_tag);
 
 	_starpu_mpi_handle_request_termination(req);
 
@@ -558,34 +559,34 @@ int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 
 	/* We cannot try to complete a MPI request that was not actually posted
 	 * to MPI yet. */
-	STARPU_PTHREAD_MUTEX_LOCK(&(req->req_mutex));
+	STARPU_PTHREAD_MUTEX_LOCK(&(req->backend->req_mutex));
 	while (!(req->submitted))
-		STARPU_PTHREAD_COND_WAIT(&(req->req_cond), &(req->req_mutex));
-	STARPU_PTHREAD_MUTEX_UNLOCK(&(req->req_mutex));
+		STARPU_PTHREAD_COND_WAIT(&(req->backend->req_cond), &(req->backend->req_mutex));
+	STARPU_PTHREAD_MUTEX_UNLOCK(&(req->backend->req_mutex));
 
 	/* Initialize the request structure */
 	 _starpu_mpi_request_init(&waiting_req);
 	waiting_req->prio = INT_MAX;
 	waiting_req->status = status;
-	waiting_req->other_request = req;
+	waiting_req->backend->other_request = req;
 	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->request_type = WAIT_REQ;
 
 	_starpu_mpi_submit_ready_request_inc(waiting_req);
 
 	/* We wait for the MPI request to finish */
-	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&req->backend->req_mutex);
 	while (!req->completed)
-		STARPU_PTHREAD_COND_WAIT(&req->req_cond, &req->req_mutex);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+		STARPU_PTHREAD_COND_WAIT(&req->backend->req_cond, &req->backend->req_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->req_mutex);
 
 	ret = req->ret;
 
 	/* The internal request structure was automatically allocated */
 	*public_req = NULL;
-	if (req->internal_req)
+	if (req->backend->internal_req)
 	{
-		_starpu_mpi_request_destroy(req->internal_req);
+		_starpu_mpi_request_destroy(req->backend->internal_req);
 	}
 	_starpu_mpi_request_destroy(req);
 	_starpu_mpi_request_destroy(waiting_req);
@@ -604,24 +605,24 @@ void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 {
 	_STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are testing for ? */
-	struct _starpu_mpi_req *req = testing_req->other_request;
+	struct _starpu_mpi_req *req = testing_req->backend->other_request;
 
-	_STARPU_MPI_DEBUG(0, "Test request %p type %s tag %"PRIi64"d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr,
+	_STARPU_MPI_DEBUG(0, "Test request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr,
 			  req->datatype_name, (int)req->count, req->registered_datatype);
 
-	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 
 #ifdef STARPU_SIMGRID
 	req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, testing_req->flag);
 	memcpy(testing_req->status, &req->status_store, sizeof(*testing_req->status));
 #else
-	req->ret = MPI_Test(&req->data_request, testing_req->flag, testing_req->status);
+	req->ret = MPI_Test(&req->backend->data_request, testing_req->flag, testing_req->status);
 #endif
 
 	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
-	_STARPU_MPI_TRACE_UTESTING_END(req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_TRACE_UTESTING_END(req->node_tag.node.rank, req->node_tag.data_tag);
 
 	if (*testing_req->flag)
 	{
@@ -629,10 +630,10 @@ void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 		_starpu_mpi_handle_request_termination(req);
 	}
 
-	STARPU_PTHREAD_MUTEX_LOCK(&testing_req->req_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&testing_req->backend->req_mutex);
 	testing_req->completed = 1;
-	STARPU_PTHREAD_COND_SIGNAL(&testing_req->req_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&testing_req->req_mutex);
+	STARPU_PTHREAD_COND_SIGNAL(&testing_req->backend->req_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&testing_req->backend->req_mutex);
 	_STARPU_MPI_LOG_OUT();
 }
 
@@ -647,9 +648,9 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
 
-	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&req->backend->req_mutex);
 	unsigned submitted = req->submitted;
-	STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->req_mutex);
 
 	if (submitted)
 	{
@@ -660,7 +661,7 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		testing_req->prio = INT_MAX;
 		testing_req->flag = flag;
 		testing_req->status = status;
-		testing_req->other_request = req;
+		testing_req->backend->other_request = req;
 		testing_req->func = _starpu_mpi_test_func;
 		testing_req->completed = 0;
 		testing_req->request_type = TEST_REQ;
@@ -668,10 +669,10 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		_starpu_mpi_submit_ready_request_inc(testing_req);
 
 		/* We wait for the test request to finish */
-		STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
+		STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->backend->req_mutex));
 		while (!(testing_req->completed))
-			STARPU_PTHREAD_COND_WAIT(&(testing_req->req_cond), &(testing_req->req_mutex));
-		STARPU_PTHREAD_MUTEX_UNLOCK(&(testing_req->req_mutex));
+			STARPU_PTHREAD_COND_WAIT(&(testing_req->backend->req_cond), &(testing_req->backend->req_mutex));
+		STARPU_PTHREAD_MUTEX_UNLOCK(&(testing_req->backend->req_mutex));
 
 		ret = testing_req->ret;
 
@@ -681,9 +682,9 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 			 * request structure which was automatically allocated
 			 * */
 			*public_req = NULL;
-			if (req->internal_req)
+			if (req->backend->internal_req)
 			{
-				_starpu_mpi_request_destroy(req->internal_req);
+				_starpu_mpi_request_destroy(req->backend->internal_req);
 			}
 			_starpu_mpi_request_destroy(req);
 		}
@@ -709,7 +710,7 @@ static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 {
 	_STARPU_MPI_LOG_IN();
 
-	barrier_req->ret = MPI_Barrier(barrier_req->node_tag.comm);
+	barrier_req->ret = MPI_Barrier(barrier_req->node_tag.node.comm);
 	STARPU_MPI_ASSERT_MSG(barrier_req->ret == MPI_SUCCESS, "MPI_Barrier returning %s", _starpu_mpi_get_mpi_error_code(barrier_req->ret));
 
 	_starpu_mpi_handle_request_termination(barrier_req);
@@ -752,16 +753,16 @@ int _starpu_mpi_barrier(MPI_Comm comm)
 	barrier_req->prio = INT_MAX;
 	barrier_req->func = _starpu_mpi_barrier_func;
 	barrier_req->request_type = BARRIER_REQ;
-	barrier_req->node_tag.comm = comm;
+	barrier_req->node_tag.node.comm = comm;
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 	_starpu_mpi_submit_ready_request(barrier_req);
 
 	/* We wait for the MPI request to finish */
-	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->backend->req_mutex);
 	while (!barrier_req->completed)
-		STARPU_PTHREAD_COND_WAIT(&barrier_req->req_cond, &barrier_req->req_mutex);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->req_mutex);
+		STARPU_PTHREAD_COND_WAIT(&barrier_req->backend->req_cond, &barrier_req->backend->req_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->backend->req_mutex);
 
 	_starpu_mpi_request_destroy(barrier_req);
 	_STARPU_MPI_LOG_OUT();
@@ -795,14 +796,14 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 
-	_STARPU_MPI_DEBUG(2, "complete MPI request %p type %s tag %"PRIi64"d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d internal_req %p\n",
-			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr,
-			  req->datatype_name, (int)req->count, req->registered_datatype, req->internal_req);
+	_STARPU_MPI_DEBUG(2, "complete MPI request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d internal_req %p\n",
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr,
+			  req->datatype_name, (int)req->count, req->registered_datatype, req->backend->internal_req);
 
-	if (req->internal_req)
+	if (req->backend->internal_req)
 	{
-		free(req->early_data_handle);
-		req->early_data_handle = NULL;
+		free(req->backend->early_data_handle);
+		req->backend->early_data_handle = NULL;
 	}
 	else
 	{
@@ -816,7 +817,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 					// has completed, as MPI can re-order messages, let's call
 					// MPI_Wait to make sure data have been sent
 					int ret;
-					ret = MPI_Wait(&req->size_req, MPI_STATUS_IGNORE);
+					ret = MPI_Wait(&req->backend->size_req, MPI_STATUS_IGNORE);
 					STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_error_code(ret));
 					free(req->ptr);
 					req->ptr = NULL;
@@ -833,15 +834,15 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 				_starpu_mpi_datatype_free(req->data_handle, &req->datatype);
 			}
 		}
-		_STARPU_MPI_TRACE_TERMINATED(req, req->node_tag.rank, req->node_tag.data_tag);
+		_STARPU_MPI_TRACE_TERMINATED(req, req->node_tag.node.rank, req->node_tag.data_tag);
 	}
 
 	_starpu_mpi_release_req_data(req);
 
-	if (req->envelope)
+	if (req->backend->envelope)
 	{
-		free(req->envelope);
-		req->envelope = NULL;
+		free(req->backend->envelope);
+		req->backend->envelope = NULL;
 	}
 
 	/* Execute the specified callback, if any */
@@ -850,10 +851,10 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 
 	/* tell anyone potentially waiting on the request that it is
 	 * terminated now */
-	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&req->backend->req_mutex);
 	req->completed = 1;
-	STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+	STARPU_PTHREAD_COND_BROADCAST(&req->backend->req_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->req_mutex);
 	_STARPU_MPI_LOG_OUT();
 }
 
@@ -902,18 +903,18 @@ static void _starpu_mpi_early_data_cb(void* arg)
 		if (args->req->detached)
 		{
 			/* have the internal request destroyed now or when completed */
-			STARPU_PTHREAD_MUTEX_LOCK(&args->req->internal_req->req_mutex);
-			if (args->req->internal_req->to_destroy)
+			STARPU_PTHREAD_MUTEX_LOCK(&args->req->backend->internal_req->backend->req_mutex);
+			if (args->req->backend->internal_req->backend->to_destroy)
 			{
 				/* The request completed first, can now destroy it */
-				STARPU_PTHREAD_MUTEX_UNLOCK(&args->req->internal_req->req_mutex);
-				_starpu_mpi_request_destroy(args->req->internal_req);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&args->req->backend->internal_req->backend->req_mutex);
+				_starpu_mpi_request_destroy(args->req->backend->internal_req);
 			}
 			else
 			{
 				/* The request didn't complete yet, tell it to destroy it when it completes */
-				args->req->internal_req->to_destroy = 1;
-				STARPU_PTHREAD_MUTEX_UNLOCK(&args->req->internal_req->req_mutex);
+				args->req->backend->internal_req->backend->to_destroy = 1;
+				STARPU_PTHREAD_MUTEX_UNLOCK(&args->req->backend->internal_req->backend->req_mutex);
 			}
 			_starpu_mpi_handle_request_termination(args->req);
 			_starpu_mpi_request_destroy(args->req);
@@ -924,11 +925,11 @@ static void _starpu_mpi_early_data_cb(void* arg)
 			// be handled when calling starpu_mpi_wait
 			// We store in the application request the internal MPI
 			// request so that it can be used by starpu_mpi_wait
-			args->req->data_request = args->req->internal_req->data_request;
-			STARPU_PTHREAD_MUTEX_LOCK(&args->req->req_mutex);
+			args->req->backend->data_request = args->req->backend->internal_req->backend->data_request;
+			STARPU_PTHREAD_MUTEX_LOCK(&args->req->backend->req_mutex);
 			args->req->submitted = 1;
-			STARPU_PTHREAD_COND_BROADCAST(&args->req->req_cond);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&args->req->req_mutex);
+			STARPU_PTHREAD_COND_BROADCAST(&args->req->backend->req_cond);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&args->req->backend->req_mutex);
 		}
 	}
 
@@ -957,17 +958,17 @@ static void _starpu_mpi_test_detached_requests(void)
 	{
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
-		_STARPU_MPI_TRACE_TEST_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
-		//_STARPU_MPI_DEBUG(3, "Test detached request %p - mpitag %"PRIi64"d - TYPE %s %d\n", &req->data_request, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->node_tag.rank);
+		_STARPU_MPI_TRACE_TEST_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
+		//_STARPU_MPI_DEBUG(3, "Test detached request %p - mpitag %"PRIi64" - TYPE %s %d\n", &req->backend->data_request, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->node_tag.node.rank);
 #ifdef STARPU_SIMGRID
 		req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, &flag);
 #else
-		STARPU_MPI_ASSERT_MSG(req->data_request != MPI_REQUEST_NULL, "Cannot test completion of the request MPI_REQUEST_NULL");
-		req->ret = MPI_Test(&req->data_request, &flag, MPI_STATUS_IGNORE);
+		STARPU_MPI_ASSERT_MSG(req->backend->data_request != MPI_REQUEST_NULL, "Cannot test completion of the request MPI_REQUEST_NULL");
+		req->ret = MPI_Test(&req->backend->data_request, &flag, MPI_STATUS_IGNORE);
 #endif
 
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
-		_STARPU_MPI_TRACE_TEST_END(req->node_tag.rank, req->node_tag.data_tag);
+		_STARPU_MPI_TRACE_TEST_END(req->node_tag.node.rank, req->node_tag.data_tag);
 
 		if (!flag)
 		{
@@ -979,7 +980,7 @@ static void _starpu_mpi_test_detached_requests(void)
 		     	struct _starpu_mpi_req *next_req;
 			next_req = _starpu_mpi_req_list_next(req);
 
-			_STARPU_MPI_TRACE_COMPLETE_BEGIN(req->request_type, req->node_tag.rank, req->node_tag.data_tag);
+			_STARPU_MPI_TRACE_COMPLETE_BEGIN(req->request_type, req->node_tag.node.rank, req->node_tag.data_tag);
 
 			STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 			if (req->request_type == SEND_REQ)
@@ -988,21 +989,21 @@ static void _starpu_mpi_test_detached_requests(void)
 			STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 			_starpu_mpi_handle_request_termination(req);
 
-			_STARPU_MPI_TRACE_COMPLETE_END(req->request_type, req->node_tag.rank, req->node_tag.data_tag);
+			_STARPU_MPI_TRACE_COMPLETE_END(req->request_type, req->node_tag.node.rank, req->node_tag.data_tag);
 
-			STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+			STARPU_PTHREAD_MUTEX_LOCK(&req->backend->req_mutex);
 			/* We don't want to free internal non-detached
 			   requests, we need to get their MPI request before
 			   destroying them */
-			if (req->is_internal_req && !req->to_destroy)
+			if (req->backend->is_internal_req && !req->backend->to_destroy)
 			{
 				/* We have completed the request, let the application request destroy it */
-				req->to_destroy = 1;
-				STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+				req->backend->to_destroy = 1;
+				STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->req_mutex);
 			}
 			else
 			{
-				STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->req_mutex);
 				_starpu_mpi_request_destroy(req);
 			}
 
@@ -1044,8 +1045,8 @@ static void _starpu_mpi_handle_ready_request(struct _starpu_mpi_req *req)
 	STARPU_MPI_ASSERT_MSG(req, "Invalid request");
 
 	/* submit the request to MPI */
-	_STARPU_MPI_DEBUG(2, "Handling new request %p type %s tag %"PRIi64"d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle,
+	_STARPU_MPI_DEBUG(2, "Handling new request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle,
 			  req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 	req->func(req);
 
@@ -1054,7 +1055,7 @@ static void _starpu_mpi_handle_ready_request(struct _starpu_mpi_req *req)
 
 static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope, MPI_Status status, MPI_Comm comm)
 {
-	_STARPU_MPI_DEBUG(20, "Request with tag %"PRIi64"d and source %d not found, creating a early_data_handle to receive incoming data..\n", envelope->data_tag, status.MPI_SOURCE);
+	_STARPU_MPI_DEBUG(20, "Request with tag %"PRIi64" and source %d not found, creating a early_data_handle to receive incoming data..\n", envelope->data_tag, status.MPI_SOURCE);
 	_STARPU_MPI_DEBUG(20, "Request sync %d\n", envelope->sync);
 
 	struct _starpu_mpi_early_data_handle* early_data_handle = _starpu_mpi_early_data_create(envelope, status.MPI_SOURCE, comm);
@@ -1084,7 +1085,7 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 		//_starpu_mpi_early_data_add(early_data_handle);
 	}
 
-	_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on early_data_handle with tag %"PRIi64"d from comm %ld src %d ..\n",
+	_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on early_data_handle with tag %"PRIi64" from comm %ld src %d ..\n",
 			  early_data_handle->node_tag.data_tag, (long int)comm, status.MPI_SOURCE);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	early_data_handle->req = _starpu_mpi_irecv_common(early_data_handle->handle, status.MPI_SOURCE,
@@ -1095,10 +1096,10 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 	// We wait until the request is pushed in the
 	// ready_request list
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-	STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->posted_mutex));
+	STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->backend->posted_mutex));
 	while (!(early_data_handle->req->posted))
-		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->posted_cond), &(early_data_handle->req->posted_mutex));
-	STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->posted_mutex));
+		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->backend->posted_cond), &(early_data_handle->req->backend->posted_mutex));
+	STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->backend->posted_mutex));
 
 #ifdef STARPU_DEVEL
 #warning check if req_ready is still necessary
@@ -1305,7 +1306,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 				{
 					struct _starpu_mpi_req *_sync_req = _starpu_mpi_sync_data_find(envelope->data_tag, envelope_status.MPI_SOURCE, envelope_comm);
 					_STARPU_MPI_DEBUG(20, "Sending data with tag %"PRIi64" to node %d\n", _sync_req->node_tag.data_tag, envelope_status.MPI_SOURCE);
-					STARPU_MPI_ASSERT_MSG(envelope->data_tag == _sync_req->node_tag.data_tag, "Tag mismatch (envelope %"PRIi64"d != req %"PRIi64"d)\n",
+					STARPU_MPI_ASSERT_MSG(envelope->data_tag == _sync_req->node_tag.data_tag, "Tag mismatch (envelope %"PRIi64" != req %"PRIi64")\n",
 							      envelope->data_tag, _sync_req->node_tag.data_tag);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 					_starpu_mpi_isend_data_func(_sync_req);
@@ -1313,7 +1314,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 				}
 				else
 				{
-					_STARPU_MPI_DEBUG(3, "Searching for application request with tag %"PRIi64"d and source %d (size %ld)\n", envelope->data_tag, envelope_status.MPI_SOURCE, envelope->size);
+					_STARPU_MPI_DEBUG(3, "Searching for application request with tag %"PRIi64" and source %d (size %ld)\n", envelope->data_tag, envelope_status.MPI_SOURCE, envelope->size);
 
 					struct _starpu_mpi_req *early_request = _starpu_mpi_early_request_dequeue(envelope->data_tag, envelope_status.MPI_SOURCE, envelope_comm);
 
@@ -1326,7 +1327,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					{
 						if (envelope->sync)
 						{
-							_STARPU_MPI_DEBUG(2000, "-------------------------> adding request for tag %l"PRIi64"\n", envelope->data_tag);
+							_STARPU_MPI_DEBUG(2000, "-------------------------> adding request for tag %"PRIi64"\n", envelope->data_tag);
 							struct _starpu_mpi_req *new_req;
 #ifdef STARPU_DEVEL
 #warning creating a request is not really useful.
@@ -1335,16 +1336,16 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 							_starpu_mpi_request_init(&new_req);
 							new_req->request_type = RECV_REQ;
 							new_req->data_handle = NULL;
-							new_req->node_tag.rank = envelope_status.MPI_SOURCE;
+							new_req->node_tag.node.rank = envelope_status.MPI_SOURCE;
 							new_req->node_tag.data_tag = envelope->data_tag;
-							new_req->node_tag.comm = envelope_comm;
+							new_req->node_tag.node.comm = envelope_comm;
 							new_req->detached = 1;
 							new_req->sync = 1;
 							new_req->callback = NULL;
 							new_req->callback_arg = NULL;
 							new_req->func = _starpu_mpi_irecv_size_func;
 							new_req->sequential_consistency = 1;
-							new_req->is_internal_req = 0; // ????
+							new_req->backend->is_internal_req = 0; // ????
 							new_req->count = envelope->size;
 							_starpu_mpi_sync_data_add(new_req);
 						}
@@ -1360,7 +1361,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					 * _starpu_mpi_handle_ready_request. */
 					else
 					{
-						_STARPU_MPI_DEBUG(2000, "A matching application request has been found for the incoming data with tag %"PRIi64"d\n", envelope->data_tag);
+						_STARPU_MPI_DEBUG(2000, "A matching application request has been found for the incoming data with tag %"PRIi64"\n", envelope->data_tag);
 						_STARPU_MPI_DEBUG(2000, "Request sync %d\n", envelope->sync);
 
 						early_request->sync = envelope->sync;
@@ -1572,7 +1573,7 @@ void _starpu_mpi_progress_shutdown(void **value)
 #ifdef STARPU_SIMGRID
 	/* FIXME: should rather properly wait for _starpu_mpi_progress_thread_func to finish */
 	(void) value;
-	MSG_process_sleep(1);
+	starpu_sleep(1);
 #else
 	STARPU_PTHREAD_JOIN(progress_thread, value);
 #endif

+ 117 - 0
mpi/src/mpi/starpu_mpi_mpi_backend.c

@@ -0,0 +1,117 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2017                                     Inria
+ * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
+ * Copyright (C) 2009-2014,2017,2018-2019                 Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <starpu_config.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_private.h>
+
+#ifdef STARPU_USE_MPI_MPI
+
+#include <mpi/starpu_mpi_mpi_backend.h>
+#include <mpi/starpu_mpi_tag.h>
+#include <mpi/starpu_mpi_comm.h>
+#include <mpi/starpu_mpi_comm.h>
+#include <mpi/starpu_mpi_tag.h>
+#include <mpi/starpu_mpi_driver.h>
+
+void _starpu_mpi_mpi_backend_init(struct starpu_conf *conf) /* backend init hook: forwards conf to the MPI driver initialisation */
+{
+	_starpu_mpi_driver_init(conf);
+}
+
+void _starpu_mpi_mpi_backend_shutdown(void) /* backend shutdown hook: calls the tag, comm and driver shutdown routines, in that order */
+{
+	_starpu_mpi_tag_shutdown();
+	_starpu_mpi_comm_shutdown();
+	_starpu_mpi_driver_shutdown();
+}
+
+int _starpu_mpi_mpi_backend_reserve_core(void) /* returns 1 when STARPU_MPI_DRIVER_CALL_FREQUENCY, read with a default of 0, is <= 0; returns 0 otherwise */
+{
+	return (starpu_get_env_number_default("STARPU_MPI_DRIVER_CALL_FREQUENCY", 0) <= 0);
+}
+
+void _starpu_mpi_mpi_backend_request_init(struct _starpu_mpi_req *req) /* allocates req->backend and initialises its MPI-backend fields */
+{
+	_STARPU_MPI_CALLOC(req->backend, 1, sizeof(struct _starpu_mpi_req_backend)); /* NOTE(review): presumably zero-fills like calloc, making the explicit 0/NULL stores below redundant but self-documenting - confirm */
+
+	req->backend->data_request = 0; /* NOTE(review): plain 0 rather than MPI_REQUEST_NULL - confirm no code compares this field against MPI_REQUEST_NULL */
+
+	STARPU_PTHREAD_MUTEX_INIT(&req->backend->req_mutex, NULL);
+	STARPU_PTHREAD_COND_INIT(&req->backend->req_cond, NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&req->backend->posted_mutex, NULL);
+	STARPU_PTHREAD_COND_INIT(&req->backend->posted_cond, NULL);
+
+	req->backend->other_request = NULL;
+
+	req->backend->size_req = 0;
+	req->backend->internal_req = NULL;
+	req->backend->is_internal_req = 0;
+	req->backend->to_destroy = 1; /* default value; _starpu_mpi_mpi_backend_request_fill() overwrites it with !is_internal_req */
+	req->backend->early_data_handle = NULL;
+	req->backend->envelope = NULL;
+}
+
+void _starpu_mpi_mpi_backend_request_fill(struct _starpu_mpi_req *req, MPI_Comm comm, int is_internal_req) /* registers comm and records in req->backend whether the request is internal */
+{
+	_starpu_mpi_comm_register(comm);
+
+	req->backend->is_internal_req = is_internal_req;
+	/* For internal requests, we wait for both the request completion and the matching application request completion */
+	req->backend->to_destroy = !is_internal_req; /* so an internal request starts with to_destroy == 0 */
+}
+
+void _starpu_mpi_mpi_backend_request_destroy(struct _starpu_mpi_req *req) /* destroys the two mutex/condition pairs created by _starpu_mpi_mpi_backend_request_init(), then frees req->backend */
+{
+	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->req_mutex);
+	STARPU_PTHREAD_COND_DESTROY(&req->backend->req_cond);
+	STARPU_PTHREAD_MUTEX_DESTROY(&req->backend->posted_mutex);
+	STARPU_PTHREAD_COND_DESTROY(&req->backend->posted_cond);
+	free(req->backend); /* req->backend is not reset here and is dangling from this point on */
+}
+
+void _starpu_mpi_mpi_backend_data_clear(starpu_data_handle_t data_handle) /* backend hook: forwards to _starpu_mpi_tag_data_release(); its int result is discarded */
+{
+	_starpu_mpi_tag_data_release(data_handle);
+}
+
+void _starpu_mpi_mpi_backend_data_register(starpu_data_handle_t data_handle, starpu_mpi_tag_t data_tag) /* backend hook: forwards the handle/tag pair to _starpu_mpi_tag_data_register() */
+{
+	_starpu_mpi_tag_data_register(data_handle, data_tag);
+}
+
+void _starpu_mpi_mpi_backend_comm_register(MPI_Comm comm) /* backend hook: forwards comm to _starpu_mpi_comm_register() */
+{
+	_starpu_mpi_comm_register(comm);
+}
+
+struct _starpu_mpi_backend _mpi_backend = /* hook table binding each backend entry point to the function of the same role defined in this file */
+{
+ 	._starpu_mpi_backend_init = _starpu_mpi_mpi_backend_init,
+ 	._starpu_mpi_backend_shutdown = _starpu_mpi_mpi_backend_shutdown,
+	._starpu_mpi_backend_reserve_core = _starpu_mpi_mpi_backend_reserve_core,
+	._starpu_mpi_backend_request_init = _starpu_mpi_mpi_backend_request_init,
+	._starpu_mpi_backend_request_fill = _starpu_mpi_mpi_backend_request_fill,
+	._starpu_mpi_backend_request_destroy = _starpu_mpi_mpi_backend_request_destroy,
+	._starpu_mpi_backend_data_clear = _starpu_mpi_mpi_backend_data_clear,
+	._starpu_mpi_backend_data_register = _starpu_mpi_mpi_backend_data_register,
+	._starpu_mpi_backend_comm_register = _starpu_mpi_mpi_backend_comm_register
+};
+
+#endif /* STARPU_USE_MPI_MPI */

+ 80 - 0
mpi/src/mpi/starpu_mpi_mpi_backend.h

@@ -0,0 +1,80 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2017                                     Inria
+ * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
+ * Copyright (C) 2009-2014,2017,2018-2019                 Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_MPI_BACKEND_H__
+#define __STARPU_MPI_MPI_BACKEND_H__
+
+#include <common/config.h>
+#include <common/uthash.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#ifdef STARPU_USE_MPI_MPI
+
+extern int _starpu_mpi_tag; /* base MPI tag; the three tags below are this value and the next two consecutive integers */
+#define _STARPU_MPI_TAG_ENVELOPE  _starpu_mpi_tag
+#define _STARPU_MPI_TAG_DATA      (_starpu_mpi_tag+1) /* parenthesized so the sum stays intact when the macro expands inside a larger expression */
+#define _STARPU_MPI_TAG_SYNC_DATA (_starpu_mpi_tag+2)
+
+enum _starpu_envelope_mode /* type of the mode field of struct _starpu_mpi_envelope */
+{
+	_STARPU_MPI_ENVELOPE_DATA=0,
+	_STARPU_MPI_ENVELOPE_SYNC_READY=1
+};
+
+struct _starpu_mpi_envelope /* NOTE(review): presumably the small header sent ahead of the payload - confirm against the send path */
+{
+	enum _starpu_envelope_mode mode;
+	starpu_ssize_t size; /* copied into the new request's count by the progress thread when sync is set */
+	starpu_mpi_tag_t data_tag; /* compared with node_tag.data_tag of requests by the progress thread */
+	unsigned sync;
+};
+
+struct _starpu_mpi_req_backend /* MPI-backend part of a request, reached through req->backend */
+{
+	MPI_Request data_request;
+
+	starpu_pthread_mutex_t req_mutex;
+	starpu_pthread_cond_t req_cond;
+	starpu_pthread_mutex_t posted_mutex;
+	starpu_pthread_cond_t posted_cond;
+	/* In the case of a Wait/Test request, we are going to post a request
+	 * to test the completion of another request */
+	struct _starpu_mpi_req *other_request;
+
+	MPI_Request size_req;
+
+	struct _starpu_mpi_envelope* envelope;
+
+	unsigned is_internal_req:1; /* set from the is_internal_req argument of the backend request_fill hook */
+	unsigned to_destroy:1; /* initialised to 1 by the request_init hook, then set to !is_internal_req by the request_fill hook */
+	struct _starpu_mpi_req *internal_req;
+	struct _starpu_mpi_early_data_handle *early_data_handle;
+     	UT_hash_handle hh;
+};
+
+#endif // STARPU_USE_MPI_MPI
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_MPI_BACKEND_H__

+ 6 - 6
mpi/src/mpi/starpu_mpi_sync_data.c

@@ -63,11 +63,11 @@ void _starpu_mpi_sync_data_handle_display_hash(struct _starpu_mpi_node_tag *node
 
 	if (hashlist == NULL)
 	{
-		_STARPU_MPI_DEBUG(60, "Hashlist for comm %ld source %d and tag %ld does not exist\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
+		_STARPU_MPI_DEBUG(60, "Hashlist for comm %ld source %d and tag %ld does not exist\n", (long int)node_tag->node.comm, node_tag->node.rank, node_tag->data_tag);
 	}
 	else if (_starpu_mpi_req_list_empty(&hashlist->list))
 	{
-		_STARPU_MPI_DEBUG(60, "Hashlist for comm %ld source %d and tag %ld is empty\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
+		_STARPU_MPI_DEBUG(60, "Hashlist for comm %ld source %d and tag %ld is empty\n", (long int)node_tag->node.comm, node_tag->node.rank, node_tag->data_tag);
 	}
 	else
 	{
@@ -76,7 +76,7 @@ void _starpu_mpi_sync_data_handle_display_hash(struct _starpu_mpi_node_tag *node
 		     cur != _starpu_mpi_req_list_end(&hashlist->list);
 		     cur = _starpu_mpi_req_list_next(cur))
 		{
-			_STARPU_MPI_DEBUG(60, "Element for comm %ld source %d and tag %ld: %p\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag, cur);
+			_STARPU_MPI_DEBUG(60, "Element for comm %ld source %d and tag %ld: %p\n", (long int)node_tag->node.comm, node_tag->node.rank, node_tag->data_tag, cur);
 		}
 	}
 }
@@ -99,8 +99,8 @@ struct _starpu_mpi_req *_starpu_mpi_sync_data_find(starpu_mpi_tag_t data_tag, in
 	struct _starpu_mpi_sync_data_handle_hashlist *found;
 
 	memset(&node_tag, 0, sizeof(struct _starpu_mpi_node_tag));
-	node_tag.comm = comm;
-	node_tag.rank = source;
+	node_tag.node.comm = comm;
+	node_tag.node.rank = source;
 	node_tag.data_tag = data_tag;
 
 	_STARPU_MPI_DEBUG(60, "Looking for sync_data_handle with comm %ld source %d tag %ld in the hashmap\n", (long int)comm, source, data_tag);
@@ -132,7 +132,7 @@ void _starpu_mpi_sync_data_add(struct _starpu_mpi_req *sync_req)
 {
 	struct _starpu_mpi_sync_data_handle_hashlist *hashlist;
 
-	_STARPU_MPI_DEBUG(2000, "Adding sync_req %p with comm %ld source %d tag %ld in the hashmap\n", sync_req, (long int)sync_req->node_tag.comm, sync_req->node_tag.rank, sync_req->node_tag.data_tag);
+	_STARPU_MPI_DEBUG(2000, "Adding sync_req %p with comm %ld source %d tag %ld in the hashmap\n", sync_req, (long int)sync_req->node_tag.node.comm, sync_req->node_tag.node.rank, sync_req->node_tag.data_tag);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_sync_data_handle_mutex);
 	HASH_FIND(hh, _starpu_mpi_sync_data_handle_hashmap, &sync_req->node_tag, sizeof(struct _starpu_mpi_node_tag), hashlist);

+ 3 - 3
mpi/src/mpi/starpu_mpi_tag.c

@@ -89,7 +89,7 @@ void _starpu_mpi_tag_data_register(starpu_data_handle_t handle, starpu_mpi_tag_t
 	STARPU_ASSERT_MSG(!(_starpu_mpi_tag_get_data_handle_from_tag(data_tag)),
 			  "There is already a data handle %p registered with the tag %ld\n", _starpu_mpi_tag_get_data_handle_from_tag(data_tag), data_tag);
 
-	_STARPU_MPI_DEBUG(42, "Adding handle %p with tag %"PRIi64"d in hashtable\n", handle, data_tag);
+	_STARPU_MPI_DEBUG(42, "Adding handle %p with tag %"PRIi64" in hashtable\n", handle, data_tag);
 
 	entry->handle = handle;
 	entry->data_tag = data_tag;
@@ -108,7 +108,7 @@ int _starpu_mpi_tag_data_release(starpu_data_handle_t handle)
 {
 	starpu_mpi_tag_t data_tag = starpu_mpi_data_get_tag(handle);
 
-	_STARPU_MPI_DEBUG(42, "Removing handle %p with tag %"PRIi64"d from hashtable\n", handle, data_tag);
+	_STARPU_MPI_DEBUG(42, "Removing handle %p with tag %"PRIi64" from hashtable\n", handle, data_tag);
 
 	if (data_tag != -1)
 	{
@@ -116,7 +116,7 @@ int _starpu_mpi_tag_data_release(starpu_data_handle_t handle)
 
 		_starpu_spin_lock(&registered_tag_handles_lock);
 		HASH_FIND(hh, registered_tag_handles, &(((struct _starpu_mpi_data *)(handle->mpi_data))->node_tag.data_tag), sizeof(tag_entry->data_tag), tag_entry);
-		STARPU_ASSERT_MSG((tag_entry != NULL),"Data handle %p with tag %"PRIi64"d isn't in the hashmap !", handle, data_tag);
+		STARPU_ASSERT_MSG((tag_entry != NULL),"Data handle %p with tag %"PRIi64" isn't in the hashmap !", handle, data_tag);
 
 		HASH_DEL(registered_tag_handles, tag_entry);
 

+ 51 - 51
mpi/src/nmad/starpu_mpi_nmad.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2017                                     Inria
- * Copyright (C) 2010-2015,2017,2018                      CNRS
+ * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
  * Copyright (C) 2009-2014,2017,2018-2019                 Université de Bordeaux
  * Copyright (C) 2017                                     Guillaume Beauchamp
  *
@@ -38,7 +38,7 @@
 
 #include <nm_sendrecv_interface.h>
 #include <nm_mpi_nmad.h>
-
+#include "starpu_mpi_nmad_backend.h"
 
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event);
 #ifdef STARPU_VERBOSE
@@ -88,30 +88,30 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 
-	_STARPU_MPI_DEBUG(30, "post NM isend request %p type %s tag %ld src %d data %p datasize %ld ptr %p datatype '%s' count %d registered_datatype %d sync %d\n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, req->datatype_name, (int)req->count, req->registered_datatype, req->sync);
+	_STARPU_MPI_DEBUG(30, "post NM isend request %p type %s tag %ld src %d data %p datasize %ld ptr %p datatype '%s' count %d registered_datatype %d sync %d\n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, req->datatype_name, (int)req->count, req->registered_datatype, req->sync);
 
-	_starpu_mpi_comm_amounts_inc(req->node_tag.comm, req->node_tag.rank, req->datatype, req->count);
+	_starpu_mpi_comm_amounts_inc(req->node_tag.node.comm, req->node_tag.node.rank, req->datatype, req->count);
 
-	_STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag, 0);
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag, 0);
 
 	struct nm_data_s data;
 	nm_mpi_nmad_data_get(&data, (void*)req->ptr, req->datatype, req->count);
-	nm_sr_send_init(req->session, &(req->data_request));
-	nm_sr_send_pack_data(req->session, &(req->data_request), &data);
-	nm_sr_send_set_priority(req->session, &req->data_request, req->prio);
+	nm_sr_send_init(req->backend->session, &(req->backend->data_request));
+	nm_sr_send_pack_data(req->backend->session, &(req->backend->data_request), &data);
+	nm_sr_send_set_priority(req->backend->session, &req->backend->data_request, req->prio);
 
 	if (req->sync == 0)
 	{
-		req->ret = nm_sr_send_isend(req->session, &(req->data_request), req->gate, req->node_tag.data_tag);
+		req->ret = nm_sr_send_isend(req->backend->session, &(req->backend->data_request), req->backend->gate, req->node_tag.data_tag);
 		STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "MPI_Isend returning %d", req->ret);
 	}
 	else
 	{
-		req->ret = nm_sr_send_issend(req->session, &(req->data_request), req->gate, req->node_tag.data_tag);
+		req->ret = nm_sr_send_issend(req->backend->session, &(req->backend->data_request), req->backend->gate, req->node_tag.data_tag);
 		STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "MPI_Issend returning %d", req->ret);
 	}
 
-	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid);
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid);
 
 	_starpu_mpi_handle_pending_request(req);
 
@@ -124,7 +124,7 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 
 	if (req->registered_datatype == 1)
 	{
-		req->waited = 1;
+		req->backend->waited = 1;
 		req->count = 1;
 		req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
 	}
@@ -132,7 +132,7 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 	{
 		starpu_ssize_t psize = -1;
 		int ret;
-		req->waited =2;
+		req->backend->waited =2;
 
 		// Do not pack the data, just try to find out the size
 		starpu_data_pack(req->data_handle, NULL, &psize);
@@ -140,12 +140,12 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 		if (psize != -1)
 		{
 			// We already know the size of the data, let's send it to overlap with the packing of the data
-			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (first call to pack)\n", psize, sizeof(req->count), "MPI_BYTE", req->node_tag.rank);
+			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (first call to pack)\n", psize, sizeof(req->count), "MPI_BYTE", req->node_tag.node.rank);
 			req->count = psize;
-			//ret = nm_sr_isend(nm_mpi_communicator_get_session(p_req->p_comm),nm_mpi_communicator_get_gate(p_comm,req->srcdst), req->mpi_tag,&req->count, sizeof(req->count), &req->size_req);
-			ret = nm_sr_isend(req->session,req->gate, req->node_tag.data_tag,&req->count, sizeof(req->count), &req->size_req);
+			//ret = nm_sr_isend(nm_mpi_communicator_get_session(p_req->p_comm),nm_mpi_communicator_get_gate(p_comm,req->srcdst), req->mpi_tag,&req->count, sizeof(req->count), &req->backend->size_req);
+			ret = nm_sr_isend(req->backend->session,req->backend->gate, req->node_tag.data_tag,&req->count, sizeof(req->count), &req->backend->size_req);
 
-			//	ret = MPI_Isend(&req->count, sizeof(req->count), MPI_BYTE, req->srcdst, req->mpi_tag, req->comm, &req->size_req);
+			//	ret = MPI_Isend(&req->count, sizeof(req->count), MPI_BYTE, req->srcdst, req->mpi_tag, req->comm, &req->backend->size_req);
 			STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "when sending size, nm_sr_isend returning %d", ret);
 		}
 
@@ -154,8 +154,8 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 		if (psize == -1)
 		{
 			// We know the size now, let's send it
-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %ld to node %d (second call to pack)\n", req->count, sizeof(req->count), "MPI_BYTE", req->node_tag.data_tag, req->node_tag.rank);
-			ret = nm_sr_isend(req->session,req->gate, req->node_tag.data_tag,&req->count, sizeof(req->count), &req->size_req);
+			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %ld to node %d (second call to pack)\n", req->count, sizeof(req->count), "MPI_BYTE", req->node_tag.data_tag, req->node_tag.node.rank);
+			ret = nm_sr_isend(req->backend->session,req->backend->gate, req->node_tag.data_tag,&req->count, sizeof(req->count), &req->backend->size_req);
 			STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "when sending size, nm_sr_isend returning %d", ret);
 		}
 		else
@@ -179,18 +179,18 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 
-	_STARPU_MPI_DEBUG(20, "post NM irecv request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
+	_STARPU_MPI_DEBUG(20, "post NM irecv request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
-	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 
 	//req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
 	struct nm_data_s data;
 	nm_mpi_nmad_data_get(&data, (void*)req->ptr, req->datatype, req->count);
-	nm_sr_recv_init(req->session, &(req->data_request));
-	nm_sr_recv_unpack_data(req->session, &(req->data_request), &data);
-	nm_sr_recv_irecv(req->session, &(req->data_request), req->gate, req->node_tag.data_tag, NM_TAG_MASK_FULL);
+	nm_sr_recv_init(req->backend->session, &(req->backend->data_request));
+	nm_sr_recv_unpack_data(req->backend->session, &(req->backend->data_request), &data);
+	nm_sr_recv_irecv(req->backend->session, &(req->backend->data_request), req->backend->gate, req->node_tag.data_tag, NM_TAG_MASK_FULL);
 
-	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag);
 
 	_starpu_mpi_handle_pending_request(req);
 
@@ -230,8 +230,8 @@ void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 		struct _starpu_mpi_irecv_size_callback *callback = malloc(sizeof(struct _starpu_mpi_irecv_size_callback));
 		callback->req = req;
 		starpu_variable_data_register(&callback->handle, 0, (uintptr_t)&(callback->req->count), sizeof(callback->req->count));
-		_STARPU_MPI_DEBUG(4, "Receiving size with tag %ld from node %d\n", req->node_tag.data_tag, req->node_tag.rank);
-		_starpu_mpi_irecv_common(callback->handle, req->node_tag.rank, req->node_tag.data_tag, req->node_tag.comm, 1, 0, _starpu_mpi_irecv_size_callback, callback,1,0,0);
+		_STARPU_MPI_DEBUG(4, "Receiving size with tag %ld from node %d\n", req->node_tag.data_tag, req->node_tag.node.rank);
+		_starpu_mpi_irecv_common(callback->handle, req->node_tag.node.rank, req->node_tag.data_tag, req->node_tag.node.comm, 1, 0, _starpu_mpi_irecv_size_callback, callback,1,0,0);
 	}
 
 }
@@ -243,7 +243,7 @@ void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 /********************************************************/
 
 #define _starpu_mpi_req_status(PUBLIC_REQ,STATUS) do {			\
-	STATUS->MPI_SOURCE=PUBLIC_REQ->node_tag.rank; /**< field name mandatory by spec */ \
+	STATUS->MPI_SOURCE=PUBLIC_REQ->node_tag.node.rank; /**< field name mandatory by spec */ \
 	STATUS->MPI_TAG=PUBLIC_REQ->node_tag.data_tag;    /**< field name mandatory by spec */ \
 	STATUS->MPI_ERROR=PUBLIC_REQ->ret;  /**< field name mandatory by spec */ \
 	STATUS->size=PUBLIC_REQ->count;       /**< size of data received */ \
@@ -259,9 +259,9 @@ int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 
 	/* We must do a test_locked to avoid a race condition:
 	 * without it, req_cond could still be in use and could not be freed. */
-	while (!req->completed || ! piom_cond_test_locked(&(req->req_cond),REQ_FINALIZED))
+	while (!req->completed || ! piom_cond_test_locked(&(req->backend->req_cond),REQ_FINALIZED))
 	{
-		piom_cond_wait(&(req->req_cond),REQ_FINALIZED);
+		piom_cond_wait(&(req->backend->req_cond),REQ_FINALIZED);
 	}
 
 	if (status!=MPI_STATUS_IGNORE)
@@ -286,17 +286,17 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 	struct _starpu_mpi_req *req = *public_req;
 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
-	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 
 	/* We must do a test_locked to avoid a race condition:
 	 * without it, req_cond could still be in use and could not be freed. */
-	*flag = req->completed && piom_cond_test_locked(&(req->req_cond),REQ_FINALIZED);
+	*flag = req->completed && piom_cond_test_locked(&(req->backend->req_cond),REQ_FINALIZED);
 	if (*flag && status!=MPI_STATUS_IGNORE)
 		_starpu_mpi_req_status(req,status);
 
-	_STARPU_MPI_TRACE_UTESTING_END(req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_TRACE_UTESTING_END(req->node_tag.node.rank, req->node_tag.data_tag);
 
 	if(*flag)
 	{
@@ -352,23 +352,23 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,n
 	_STARPU_MPI_LOG_IN();
 
 	_STARPU_MPI_DEBUG(2, "complete MPI request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
 	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ)
 	{
 		if (req->registered_datatype == 0)
 		{
-			if(req->waited == 1)
+			if(req->backend->waited == 1)
 			        nm_mpi_nmad_data_release(req->datatype);
 			if (req->request_type == SEND_REQ)
 			{
-				req->waited--;
+				req->backend->waited--;
 				// We need to make sure the communication for sending the size
 				// has completed, as MPI can re-order messages, let's count
 				// received messages.
 				// FIXME concurrent access.
 				STARPU_ASSERT_MSG(event == NM_SR_EVENT_FINALIZED, "Callback with event %d", event);
-				if(req->waited>0)
+				if(req->backend->waited>0)
 					return;
 
 			}
@@ -384,7 +384,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,n
 			_starpu_mpi_datatype_free(req->data_handle, &req->datatype);
 		}
 	}
-	_STARPU_MPI_TRACE_TERMINATED(req, req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_TRACE_TERMINATED(req, req->node_tag.node.rank, req->node_tag.data_tag);
 	_starpu_mpi_release_req_data(req);
 
 	/* Execute the specified callback, if any */
@@ -411,7 +411,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,n
 			/* tell anyone potentially waiting on the request that it is
 			 * terminated now (should be done after the callback)*/
 			req->completed = 1;
-			piom_cond_signal(&req->req_cond, REQ_FINALIZED);
+			piom_cond_signal(&req->backend->req_cond, REQ_FINALIZED);
 		}
 		int pending_remaining = STARPU_ATOMIC_ADD(&pending_request, -1);
 		if (!running && !pending_remaining)
@@ -427,16 +427,16 @@ void _starpu_mpi_handle_request_termination_callback(nm_sr_event_t event, const
 
 static void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req)
 {
-	if(req->request_type == SEND_REQ && req->waited>1)
+	if(req->request_type == SEND_REQ && req->backend->waited>1)
 	{
-		nm_sr_request_set_ref(&(req->size_req), req);
-		nm_sr_request_monitor(req->session, &(req->size_req), NM_SR_EVENT_FINALIZED,_starpu_mpi_handle_request_termination_callback);
+		nm_sr_request_set_ref(&(req->backend->size_req), req);
+		nm_sr_request_monitor(req->backend->session, &(req->backend->size_req), NM_SR_EVENT_FINALIZED,_starpu_mpi_handle_request_termination_callback);
 	}
 	/* the if must be before, because the first callback can directly free
-	* a detached request (the second callback free if req->waited>1). */
-	nm_sr_request_set_ref(&(req->data_request), req);
+	* a detached request (the second callback free if req->backend->waited>1). */
+	nm_sr_request_set_ref(&(req->backend->data_request), req);
 
-	nm_sr_request_monitor(req->session, &(req->data_request), NM_SR_EVENT_FINALIZED,_starpu_mpi_handle_request_termination_callback);
+	nm_sr_request_monitor(req->backend->session, &(req->backend->data_request), NM_SR_EVENT_FINALIZED,_starpu_mpi_handle_request_termination_callback);
 }
 
 void _starpu_mpi_coop_sends_build_tree(struct _starpu_mpi_coop_sends *coop_sends)
@@ -453,7 +453,7 @@ void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, in
 	{
 		if (coop_sends->reqs_array[i]->request_type == SEND_REQ && submit_data)
 		{
-			_STARPU_MPI_DEBUG(0, "cooperative sends %p sending to %d\n", coop_sends, coop_sends->reqs_array[i]->node_tag.rank);
+			_STARPU_MPI_DEBUG(0, "cooperative sends %p sending to %d\n", coop_sends, coop_sends->reqs_array[i]->node_tag.node.rank);
 			_starpu_mpi_submit_ready_request(coop_sends->reqs_array[i]);
 		}
 		/* TODO: handle redirect requests */
@@ -468,7 +468,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 
 	/* submit the request to MPI directly from submitter */
 	_STARPU_MPI_DEBUG(2, "Handling new request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 	req->func(req);
 
 	_STARPU_MPI_LOG_OUT();
@@ -572,7 +572,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		else
 		{
 			c->req->completed=1;
-			piom_cond_signal(&(c->req->req_cond), REQ_FINALIZED);
+			piom_cond_signal(&(c->req->backend->req_cond), REQ_FINALIZED);
 		}
 		STARPU_ATOMIC_ADD( &pending_request, -1);
 		/* we signal that the request is completed.*/
@@ -685,12 +685,12 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 			(strcmp(s_idle_hooks, "HOOK")   == 0) ? PIOM_POLL_POINT_HOOK :
 			0;
 	}
-	
+
 	if(polling_point_prog)
 	{
 		starpu_progression_hook_register((unsigned (*)(void *))&piom_ltask_schedule, (void *)&polling_point_prog);
 	}
-	
+
 	if(polling_point_idle)
 	{
 		starpu_idle_hook_register((unsigned (*)(void *))&piom_ltask_schedule, (void *)&polling_point_idle);

+ 87 - 0
mpi/src/nmad/starpu_mpi_nmad_backend.c

@@ -0,0 +1,87 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2017                                     Inria
+ * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
+ * Copyright (C) 2009-2014,2017,2018-2019                 Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include "starpu_mpi_nmad_backend.h"
+#include <starpu_mpi_private.h>
+
+#ifdef STARPU_USE_MPI_NMAD
+/* Backend init hook: defaults NMAD_STRATEGY to "prio" unless it is already set; conf is ignored. */
+void _starpu_mpi_nmad_backend_init(struct starpu_conf *conf)
+{
+	(void)conf;
+	/* strat_prio is preferred for StarPU instead of default strat_aggreg */
+	setenv("NMAD_STRATEGY", "prio", 0 /* do not overwrite user-supplied value, if set */);
+}
+/* Backend shutdown hook: empty body, nothing is done for this backend. */
+void _starpu_mpi_nmad_backend_shutdown(void)
+{
+}
+/* Always returns 1: starpu_mpi_init_conf() reserves a core for the progression thread when this hook returns non-zero. */
+int _starpu_mpi_nmad_backend_reserve_core(void)
+{
+	return 1;
+}
+/* Per-request init hook: allocates req->backend (one element, via _STARPU_MPI_CALLOC) and initialises its req_cond. */
+void _starpu_mpi_nmad_backend_request_init(struct _starpu_mpi_req *req)
+{
+	_STARPU_MPI_CALLOC(req->backend, 1, sizeof(struct _starpu_mpi_req_backend));
+	piom_cond_init(&req->backend->req_cond, 0);
+}
+/* Per-request fill hook: hands session and gate (by address), comm and the request's rank to nm_mpi_nmad_dest(); is_internal_req is not used. */
+void _starpu_mpi_nmad_backend_request_fill(struct _starpu_mpi_req *req, MPI_Comm comm, int is_internal_req)
+{
+	nm_mpi_nmad_dest(&req->backend->session, &req->backend->gate, comm, req->node_tag.node.rank);
+}
+/* Per-request destroy hook: destroys req_cond, then frees the req->backend structure. */
+void _starpu_mpi_nmad_backend_request_destroy(struct _starpu_mpi_req *req)
+{
+	piom_cond_destroy(&(req->backend->req_cond));
+	free(req->backend);
+}
+/* Data-clear hook: no-op for this backend, data_handle is ignored. */
+void _starpu_mpi_nmad_backend_data_clear(starpu_data_handle_t data_handle)
+{
+	(void)data_handle;
+}
+/* Data-register hook: no-op for this backend, both arguments are ignored. */
+void _starpu_mpi_nmad_backend_data_register(starpu_data_handle_t data_handle, starpu_mpi_tag_t data_tag)
+{
+	(void)data_handle;
+	(void)data_tag;
+}
+/* Communicator-register hook: no-op for this backend, comm is ignored. */
+void _starpu_mpi_nmad_backend_comm_register(MPI_Comm comm)
+{
+	(void)comm;
+}
+/* Hook table of the nmad backend; starpu_mpi_private.h declares _mpi_backend extern for the generic code. */
+struct _starpu_mpi_backend _mpi_backend =
+{
+ 	._starpu_mpi_backend_init = _starpu_mpi_nmad_backend_init,
+ 	._starpu_mpi_backend_shutdown = _starpu_mpi_nmad_backend_shutdown,
+	._starpu_mpi_backend_reserve_core = _starpu_mpi_nmad_backend_reserve_core,
+	._starpu_mpi_backend_request_init = _starpu_mpi_nmad_backend_request_init,
+	._starpu_mpi_backend_request_fill = _starpu_mpi_nmad_backend_request_fill,
+	._starpu_mpi_backend_request_destroy = _starpu_mpi_nmad_backend_request_destroy,
+	._starpu_mpi_backend_data_clear = _starpu_mpi_nmad_backend_data_clear,
+	._starpu_mpi_backend_data_register = _starpu_mpi_nmad_backend_data_register,
+	._starpu_mpi_backend_comm_register = _starpu_mpi_nmad_backend_comm_register
+};
+
+#endif /* STARPU_USE_MPI_NMAD */

+ 51 - 0
mpi/src/nmad/starpu_mpi_nmad_backend.h

@@ -0,0 +1,51 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2017                                     Inria
+ * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
+ * Copyright (C) 2009-2014,2017,2018-2019                 Université de Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_NMAD_BACKEND_H__
+#define __STARPU_MPI_NMAD_BACKEND_H__
+
+#include <common/config.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#ifdef STARPU_USE_MPI_NMAD
+
+#include <nm_sendrecv_interface.h>
+#include <nm_session_interface.h>
+#include <nm_mpi_nmad.h>
+/* nmad-specific part of a request, reached through the backend pointer of struct _starpu_mpi_req. */
+struct _starpu_mpi_req_backend
+{
+	nm_gate_t gate;	/* passed by address to nm_mpi_nmad_dest() in the request_fill hook */
+	nm_session_t session;	/* passed by address to nm_mpi_nmad_dest(); later given to nm_sr_request_monitor() */
+	nm_sr_request_t data_request;	/* monitored for NM_SR_EVENT_FINALIZED */
+	int waited;	/* a send request with waited>1 also gets size_req monitored */
+	piom_cond_t req_cond;	/* signalled with REQ_FINALIZED once the request is marked completed; NOTE(review): <pioman.h> is not included here, confirm the nmad headers provide piom_cond_t */
+	nm_sr_request_t size_req;	/* in the case of user-defined datatypes, the size of the data is sent too */
+};
+
+#endif // STARPU_USE_MPI_NMAD
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_NMAD_BACKEND_H__

+ 9 - 26
mpi/src/starpu_mpi.c

@@ -36,11 +36,6 @@
 #include <core/topology.h>
 #include <core/workers.h>
 
-#if defined(STARPU_USE_MPI_MPI)
-#include <mpi/starpu_mpi_comm.h>
-#include <mpi/starpu_mpi_tag.h>
-#endif
-
 static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum starpu_data_access_mode mode, int sequential_consistency)
 {
 	/* Asynchronously request StarPU to fetch the data in main memory: when
@@ -49,10 +44,7 @@ static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum sta
 	starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_submit_ready_request, (void *)req, sequential_consistency, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
 }
 
-static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
-							int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm,
-							unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg,
-							int sequential_consistency)
+static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg, int sequential_consistency)
 {
 	if (_starpu_mpi_fake_world_size != -1)
 	{
@@ -66,9 +58,7 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 	enum starpu_data_access_mode mode = STARPU_R;
 #endif
 
-	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(
-	                                      data_handle, dest, data_tag, comm, detached, sync, prio, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func,
-					      sequential_consistency, 0, 0);
+	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, dest, data_tag, comm, detached, sync, prio, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func, sequential_consistency, 0, 0);
 	_starpu_mpi_req_willpost(req);
 
 	if (_starpu_mpi_use_coop_sends && detached == 1 && sync == 0 && callback == NULL)
@@ -253,9 +243,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 
 void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
 {
-#if defined(STARPU_USE_MPI_MPI)
-	_starpu_mpi_tag_data_release(data_handle);
-#endif
+	_mpi_backend._starpu_mpi_backend_data_clear(data_handle);
 	_starpu_mpi_cache_data_clear(data_handle);
 	free(data_handle->mpi_data);
 	data_handle->mpi_data = NULL;
@@ -273,8 +261,8 @@ struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle)
 		_STARPU_CALLOC(mpi_data, 1, sizeof(struct _starpu_mpi_data));
 		mpi_data->magic = 42;
 		mpi_data->node_tag.data_tag = -1;
-		mpi_data->node_tag.rank = -1;
-		mpi_data->node_tag.comm = MPI_COMM_WORLD;
+		mpi_data->node_tag.node.rank = -1;
+		mpi_data->node_tag.node.comm = MPI_COMM_WORLD;
 		_starpu_spin_init(&mpi_data->coop_lock);
 		data_handle->mpi_data = mpi_data;
 		_starpu_mpi_cache_data_init(data_handle);
@@ -289,19 +277,14 @@ void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, starpu_mpi_
 
 	if (data_tag != -1)
 	{
-#if defined(STARPU_USE_MPI_MPI)
-		_starpu_mpi_tag_data_register(data_handle, data_tag);
-#endif
+		_mpi_backend._starpu_mpi_backend_data_register(data_handle, data_tag);
 		mpi_data->node_tag.data_tag = data_tag;
 	}
 	if (rank != -1)
 	{
 		_STARPU_MPI_TRACE_DATA_SET_RANK(data_handle, rank);
-		mpi_data->node_tag.rank = rank;
-		mpi_data->node_tag.comm = comm;
-#if defined(STARPU_USE_MPI_MPI)
-		_starpu_mpi_comm_register(comm);
-#endif
+		mpi_data->node_tag.node.rank = rank;
+		mpi_data->node_tag.node.comm = comm;
 	}
 }
 
@@ -318,7 +301,7 @@ void starpu_mpi_data_set_tag(starpu_data_handle_t handle, starpu_mpi_tag_t data_
 int starpu_mpi_data_get_rank(starpu_data_handle_t data)
 {
 	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
-	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.rank;
+	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.node.rank;
 }
 
 starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)

+ 2 - 2
mpi/src/starpu_mpi_cache.c

@@ -260,7 +260,7 @@ void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data_handle)
 		return;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
-	starpu_mpi_comm_size(mpi_data->node_tag.comm, &size);
+	starpu_mpi_comm_size(mpi_data->node_tag.node.comm, &size);
 	for(n=0 ; n<size ; n++)
 	{
 		if (mpi_data->cache_sent[n] == 1)
@@ -326,7 +326,7 @@ static void _starpu_mpi_cache_flush_nolock(starpu_data_handle_t data_handle)
 	if (_starpu_cache_enabled == 0)
 		return;
 
-	starpu_mpi_comm_size(mpi_data->node_tag.comm, &nb_nodes);
+	starpu_mpi_comm_size(mpi_data->node_tag.node.comm, &nb_nodes);
 	for(i=0 ; i<nb_nodes ; i++)
 	{
 		if (mpi_data->cache_sent[i] == 1)

+ 5 - 5
mpi/src/starpu_mpi_coop_sends.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2018                                CNRS
+ * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2009-2018                                Université de Bordeaux
  * Copyright (C) 2012,2013,2016,2017                      Inria
  *
@@ -184,7 +184,7 @@ static int _starpu_mpi_coop_send_compatible(struct _starpu_mpi_req *req, struct
 
 	prevreq = _starpu_mpi_req_multilist_begin_coop_sends(&coop_sends->reqs);
 	return /* we can cope with tag being different */
-	          prevreq->node_tag.comm == req->node_tag.comm
+	          prevreq->node_tag.node.comm == req->node_tag.node.comm
 	       && prevreq->sequential_consistency == req->sequential_consistency;
 }
 
@@ -212,7 +212,7 @@ void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_
 					tofree = coop_sends;
 				}
 				coop_sends = mpi_data->coop_sends;
-				_STARPU_MPI_DEBUG(0, "%p: add to cooperative sends %p, dest %d\n", data_handle, coop_sends, req->node_tag.rank);
+				_STARPU_MPI_DEBUG(0, "%p: add to cooperative sends %p, dest %d\n", data_handle, coop_sends, req->node_tag.node.rank);
 				_starpu_mpi_req_multilist_push_back_coop_sends(&coop_sends->reqs, req);
 				coop_sends->n++;
 				req->coop_sends_head = coop_sends;
@@ -222,7 +222,7 @@ void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_
 			else
 			{
 				/* Nope, incompatible, put ours instead */
-				_STARPU_MPI_DEBUG(0, "%p: new cooperative sends %p, dest %d\n", data_handle, coop_sends, req->node_tag.rank);
+				_STARPU_MPI_DEBUG(0, "%p: new cooperative sends %p, dest %d\n", data_handle, coop_sends, req->node_tag.node.rank);
 				mpi_data->coop_sends = coop_sends;
 				first = 1;
 				_starpu_spin_unlock(&mpi_data->coop_lock);
@@ -234,7 +234,7 @@ void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_
 		else if (coop_sends)
 		{
 			/* Nobody else and we have allocated one, we're first! */
-			_STARPU_MPI_DEBUG(0, "%p: new cooperative sends %p, dest %d\n", data_handle, coop_sends, req->node_tag.rank);
+			_STARPU_MPI_DEBUG(0, "%p: new cooperative sends %p, dest %d\n", data_handle, coop_sends, req->node_tag.node.rank);
 			mpi_data->coop_sends = coop_sends;
 			first = 1;
 			done = 1;

+ 4 - 20
mpi/src/starpu_mpi_init.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2016,2017                                Inria
- * Copyright (C) 2010-2018                                CNRS
+ * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2009-2018                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -32,12 +32,6 @@
 #include <core/simgrid.h>
 #include <core/task.h>
 
-#if defined(STARPU_USE_MPI_MPI)
-#include <mpi/starpu_mpi_comm.h>
-#include <mpi/starpu_mpi_tag.h>
-#include <mpi/starpu_mpi_driver.h>
-#endif
-
 #ifdef STARPU_SIMGRID
 static int _mpi_world_size;
 static int _mpi_world_rank;
@@ -75,10 +69,6 @@ void _starpu_mpi_do_initialize(struct _starpu_mpi_argc_argv *argc_argv)
 	{
 		STARPU_ASSERT_MSG(argc_argv->comm == MPI_COMM_WORLD, "It does not make sense to ask StarPU-MPI to initialize MPI while a non-world communicator was given");
 		int thread_support;
-#ifdef STARPU_USE_MPI_NMAD
-		/* strat_prio is preferred for StarPU instead of default strat_aggreg */
-		setenv("NMAD_STRATEGY", "prio", 0 /* do not overwrite user-supplied value, if set */);
-#endif /* STARPU_USE_MPI_NMAD */
 		_STARPU_DEBUG("Calling MPI_Init_thread\n");
 		if (MPI_Init_thread(argc_argv->argc, argc_argv->argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
 		{
@@ -189,11 +179,9 @@ int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm c
 		conf = &localconf;
 	}
 
-#if defined(STARPU_USE_MPI_MPI)
-	_starpu_mpi_driver_init(conf);
+	_mpi_backend._starpu_mpi_backend_init(conf);
 
-	if (starpu_get_env_number_default("STARPU_MPI_DRIVER_CALL_FREQUENCY", 0) <= 0)
-#endif
+	if (_mpi_backend._starpu_mpi_backend_reserve_core())
 	{
 		/* Reserve a core for our progression thread */
 		if (conf->reserve_ncpus == -1)
@@ -227,11 +215,7 @@ int starpu_mpi_shutdown(void)
 	_starpu_mpi_comm_amounts_display(stderr, rank);
 	_starpu_mpi_comm_amounts_shutdown();
 	_starpu_mpi_cache_shutdown(world_size);
-#if defined(STARPU_USE_MPI_MPI)
-	_starpu_mpi_tag_shutdown();
-	_starpu_mpi_comm_shutdown();
-	_starpu_mpi_driver_shutdown();
-#endif
+
 	if (_mpi_initialized_starpu)
 		starpu_shutdown();
 

+ 48 - 84
mpi/src/starpu_mpi_private.h

@@ -28,11 +28,6 @@
 #include <common/prio_list.h>
 #include <common/starpu_spinlock.h>
 #include <core/simgrid.h>
-#if defined(STARPU_USE_MPI_NMAD)
-#include <pioman.h>
-#include <nm_sendrecv_interface.h>
-#include <nm_session_interface.h>
-#endif
 
 #ifdef __cplusplus
 extern "C"
@@ -52,7 +47,7 @@ struct _starpu_simgrid_mpi_req
 };
 
 int _starpu_mpi_simgrid_mpi_test(unsigned *done, int *flag);
-void _starpu_mpi_simgrid_wait_req(MPI_Request *request, 	MPI_Status *status, starpu_pthread_queue_t *queue, unsigned *done);
+void _starpu_mpi_simgrid_wait_req(MPI_Request *request, MPI_Status *status, starpu_pthread_queue_t *queue, unsigned *done);
 #endif
 
 extern int _starpu_debug_rank;
@@ -73,7 +68,7 @@ extern int _starpu_mpi_use_coop_sends;
 void _starpu_mpi_env_init(void);
 
 #ifdef STARPU_NO_ASSERT
-#  define STARPU_MPI_ASSERT_MSG(x, msg, ...)	do { if (0) { (void) (x); }} while(0)
+#  define STARPU_MPI_ASSERT_MSG(x, msg, ...) do { if (0) { (void) (x); }} while(0)
 #else
 #  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
 int _starpu_debug_rank;
@@ -107,32 +102,32 @@ int _starpu_debug_rank;
 
 #ifdef STARPU_MPI_VERBOSE
 #  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way) \
-	do								\
-	{							\
-	     	if (_starpu_mpi_comm_debug)			\
-		{					\
-     			int __size;			\
-			char _comm_name[128];		\
-			int _comm_name_len;		\
-			int _rank;			    \
+	do \
+	{ \
+	     	if (_starpu_mpi_comm_debug) \
+		{ \
+     			int __size; \
+			char _comm_name[128]; \
+			int _comm_name_len; \
+			int _rank; \
 			starpu_mpi_comm_rank(comm, &_rank); \
-			MPI_Type_size(datatype, &__size);		\
+			MPI_Type_size(datatype, &__size); \
 			MPI_Comm_get_name(comm, _comm_name, &_comm_name_len); \
 			fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%ld:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
-			fflush(stderr);					\
-		}							\
+			fflush(stderr);	\
+		} \
 	} while(0);
-#  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) 	    _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
+#  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
 #  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm)  _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
 #  define _STARPU_MPI_DEBUG(level, fmt, ...) \
 	do \
 	{								\
 		if (!_starpu_silent && _starpu_debug_level_min <= level && level <= _starpu_debug_level_max)	\
-		{							\
+		{ \
 			if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
 			fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__,## __VA_ARGS__); \
 			fflush(stderr); \
-		}			\
+		} \
 	} while(0);
 #else
 #  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way)  do { } while(0)
@@ -163,24 +158,6 @@ int _starpu_debug_rank;
 #  define _STARPU_MPI_LOG_OUT()
 #endif
 
-#if defined(STARPU_USE_MPI_MPI)
-extern int _starpu_mpi_tag;
-#define _STARPU_MPI_TAG_ENVELOPE  _starpu_mpi_tag
-#define _STARPU_MPI_TAG_DATA      _starpu_mpi_tag+1
-#define _STARPU_MPI_TAG_SYNC_DATA _starpu_mpi_tag+2
-
-#define _STARPU_MPI_ENVELOPE_DATA       0
-#define _STARPU_MPI_ENVELOPE_SYNC_READY 1
-
-struct _starpu_mpi_envelope
-{
-	int mode;
-	starpu_ssize_t size;
-	starpu_mpi_tag_t data_tag;
-	unsigned sync;
-};
-#endif /* STARPU_USE_MPI_MPI */
-
 enum _starpu_mpi_request_type
 {
 	SEND_REQ=0,
@@ -192,10 +169,15 @@ enum _starpu_mpi_request_type
 	UNKNOWN_REQ=6,
 };
 
-struct _starpu_mpi_node_tag
+struct _starpu_mpi_node	/* a peer: rank within communicator comm */
 {
 	MPI_Comm comm;
 	int rank;
+};
+/* A node (comm, rank) paired with a data tag. */
+struct _starpu_mpi_node_tag
+{
+	struct _starpu_mpi_node node;
 	starpu_mpi_tag_t data_tag;
 };
 
@@ -229,6 +211,7 @@ struct _starpu_mpi_data
 
 struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle);
 
+struct _starpu_mpi_req_backend;
 struct _starpu_mpi_req;
 LIST_TYPE(_starpu_mpi_req,
 	/* description of the data at StarPU level */
@@ -243,22 +226,13 @@ LIST_TYPE(_starpu_mpi_req,
 	starpu_ssize_t count;
 	int registered_datatype;
 
+	struct _starpu_mpi_req_backend *backend;
+
 	/* who are we talking to ? */
 	struct _starpu_mpi_node_tag node_tag;
-#if defined(STARPU_USE_MPI_NMAD)
-	nm_gate_t gate;
-	nm_session_t session;
-#endif
-
 	void (*func)(struct _starpu_mpi_req *);
 
 	MPI_Status *status;
-#if defined(STARPU_USE_MPI_NMAD)
-	nm_sr_request_t data_request;
-	int waited;
-#elif defined(STARPU_USE_MPI_MPI)
-	MPI_Request data_request;
-#endif
 	struct _starpu_mpi_req_multilist_coop_sends coop_sends;
 	struct _starpu_mpi_coop_sends *coop_sends_head;
 
@@ -266,17 +240,6 @@ LIST_TYPE(_starpu_mpi_req,
 	unsigned sync;
 
 	int ret;
-#if defined(STARPU_USE_MPI_NMAD)
-	piom_cond_t req_cond;
-#elif defined(STARPU_USE_MPI_MPI)
-	starpu_pthread_mutex_t req_mutex;
-	starpu_pthread_cond_t req_cond;
-	starpu_pthread_mutex_t posted_mutex;
-	starpu_pthread_cond_t posted_cond;
-	/* In the case of a Wait/Test request, we are going to post a request
-	 * to test the completion of another request */
-	struct _starpu_mpi_req *other_request;
-#endif
 
 	enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
 
@@ -290,21 +253,6 @@ LIST_TYPE(_starpu_mpi_req,
 	void (*callback)(void *);
 
         /* in the case of user-defined datatypes, we need to send the size of the data */
-#if defined(STARPU_USE_MPI_NMAD)
-	nm_sr_request_t size_req;
-#elif defined(STARPU_USE_MPI_MPI)
-	MPI_Request size_req;
-#endif
-
-#if defined(STARPU_USE_MPI_MPI)
-	struct _starpu_mpi_envelope* envelope;
-
-	unsigned is_internal_req:1;
-	unsigned to_destroy:1;
-	struct _starpu_mpi_req *internal_req;
-	struct _starpu_mpi_early_data_handle *early_data_handle;
-     	UT_hash_handle hh;
-#endif
 
 	int sequential_consistency;
 
@@ -346,13 +294,12 @@ void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, in
 void _starpu_mpi_submit_ready_request_inc(struct _starpu_mpi_req *req);
 void _starpu_mpi_request_init(struct _starpu_mpi_req **req);
 struct _starpu_mpi_req * _starpu_mpi_request_fill(starpu_data_handle_t data_handle,
-						       int srcdst, starpu_mpi_tag_t data_tag, MPI_Comm comm,
-						       unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg,
-						       enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
-						       int sequential_consistency,
-						       int is_internal_req,
-						       starpu_ssize_t count);
-
+						  int srcdst, starpu_mpi_tag_t data_tag, MPI_Comm comm,
+						  unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg,
+						  enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
+						  int sequential_consistency,
+						  int is_internal_req,
+						  starpu_ssize_t count);
 
 void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req);
 void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req);
@@ -380,6 +327,23 @@ void _starpu_mpi_wait_for_initialization();
 #endif
 void _starpu_mpi_data_flush(starpu_data_handle_t data_handle);
 
+/*
+ * Hooks implemented by each MPI backend; the generic code calls them through _mpi_backend
+ */
+struct _starpu_mpi_backend
+{
+	void (*_starpu_mpi_backend_init)(struct starpu_conf *conf);	/* called from starpu_mpi_init_conf() */
+	void (*_starpu_mpi_backend_shutdown)(void);	/* NOTE(review): confirm this is called at shutdown, where the backend-specific shutdown calls used to be */
+	int (*_starpu_mpi_backend_reserve_core)(void);	/* non-zero: starpu_mpi_init_conf() reserves a core for the progression thread */
+	void (*_starpu_mpi_backend_request_init)(struct _starpu_mpi_req *req);	/* called at the end of _starpu_mpi_request_init() */
+	void (*_starpu_mpi_backend_request_fill)(struct _starpu_mpi_req *req, MPI_Comm comm, int is_internal_req);	/* called at the end of _starpu_mpi_request_fill() */
+	void (*_starpu_mpi_backend_request_destroy)(struct _starpu_mpi_req *req);	/* called first in _starpu_mpi_request_destroy() */
+	void (*_starpu_mpi_backend_data_clear)(starpu_data_handle_t data_handle);	/* called from _starpu_mpi_data_clear() */
+	void (*_starpu_mpi_backend_data_register)(starpu_data_handle_t data_handle, starpu_mpi_tag_t data_tag);	/* called from starpu_mpi_data_register_comm() when data_tag != -1 */
+	void (*_starpu_mpi_backend_comm_register)(MPI_Comm comm);	/* NOTE(review): confirm this is called where _starpu_mpi_comm_register() used to be */
+};
+
+extern struct _starpu_mpi_backend _mpi_backend;	/* defined by the backend in use, e.g. mpi/src/nmad/starpu_mpi_nmad_backend.c */
 #ifdef __cplusplus
 }
 #endif

+ 9 - 54
mpi/src/starpu_mpi_req.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2018                                CNRS
+ * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2009-2019                                Université de Bordeaux
  * Copyright (C) 2012,2013,2016,2017                      Inria
  * Copyright (C) 2017                                     Guillaume Beauchamp
@@ -19,13 +19,6 @@
 
 #include <starpu.h>
 #include <starpu_mpi_private.h>
-#if defined(STARPU_USE_MPI_MPI)
-#include <mpi/starpu_mpi_comm.h>
-#endif
-#if defined(STARPU_USE_MPI_NMAD)
-#include <pioman.h>
-#include <nm_mpi_nmad.h>
-#endif
 
 void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 {
@@ -41,28 +34,17 @@ void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	(*req)->count = -1;
 	(*req)->registered_datatype = -1;
 
-	(*req)->node_tag.rank = -1;
+	(*req)->node_tag.node.rank = -1;
 	(*req)->node_tag.data_tag = -1;
-	(*req)->node_tag.comm = 0;
+	(*req)->node_tag.node.comm = 0;
 
 	(*req)->func = NULL;
 
 	(*req)->status = NULL;
-#ifdef STARPU_USE_MPI_MPI
-	(*req)->data_request = 0;
-#endif
 	(*req)->flag = NULL;
 	_starpu_mpi_req_multilist_init_coop_sends(*req);
 
 	(*req)->ret = -1;
-#ifdef STARPU_USE_MPI_NMAD
-	piom_cond_init(&((*req)->req_cond), 0);
-#elif defined(STARPU_USE_MPI_MPI)
-	STARPU_PTHREAD_MUTEX_INIT(&((*req)->req_mutex), NULL);
-	STARPU_PTHREAD_COND_INIT(&((*req)->req_cond), NULL);
-	STARPU_PTHREAD_MUTEX_INIT(&((*req)->posted_mutex), NULL);
-	STARPU_PTHREAD_COND_INIT(&((*req)->posted_cond), NULL);
-#endif
 
 	(*req)->request_type = UNKNOWN_REQ;
 
@@ -70,23 +52,11 @@ void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	(*req)->completed = 0;
 	(*req)->posted = 0;
 
-#ifdef STARPU_USE_MPI_MPI
-	(*req)->other_request = NULL;
-#endif
-
 	(*req)->sync = 0;
 	(*req)->detached = -1;
 	(*req)->callback = NULL;
 	(*req)->callback_arg = NULL;
 
-#ifdef STARPU_USE_MPI_MPI
-	(*req)->size_req = 0;
-	(*req)->internal_req = NULL;
-	(*req)->is_internal_req = 0;
-	(*req)->to_destroy = 1;
-	(*req)->early_data_handle = NULL;
-	(*req)->envelope = NULL;
-#endif
 	(*req)->sequential_consistency = 1;
 	(*req)->pre_sync_jobid = -1;
 	(*req)->post_sync_jobid = -1;
@@ -96,6 +66,7 @@ void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	starpu_pthread_queue_register(&_starpu_mpi_thread_wait, &((*req)->queue));
 	(*req)->done = 0;
 #endif
+	_mpi_backend._starpu_mpi_backend_request_init(*req);
 }
 
 struct _starpu_mpi_req *_starpu_mpi_request_fill(starpu_data_handle_t data_handle,
@@ -108,10 +79,6 @@ struct _starpu_mpi_req *_starpu_mpi_request_fill(starpu_data_handle_t data_handl
 {
 	struct _starpu_mpi_req *req;
 
-#ifdef STARPU_USE_MPI_MPI
-	_starpu_mpi_comm_register(comm);
-#endif
-
 	/* Initialize the request structure */
 	_starpu_mpi_request_init(&req);
 	req->request_type = request_type;
@@ -119,39 +86,27 @@ struct _starpu_mpi_req *_starpu_mpi_request_fill(starpu_data_handle_t data_handl
 	if (_starpu_mpi_use_prio)
 		req->prio = prio;
 	req->data_handle = data_handle;
-	req->node_tag.rank = srcdst;
+	req->node_tag.node.rank = srcdst;
 	req->node_tag.data_tag = data_tag;
-	req->node_tag.comm = comm;
+	req->node_tag.node.comm = comm;
 	req->detached = detached;
 	req->sync = sync;
 	req->callback = callback;
 	req->callback_arg = arg;
 	req->func = func;
 	req->sequential_consistency = sequential_consistency;
-#ifdef STARPU_USE_MPI_NMAD
-	nm_mpi_nmad_dest(&req->session, &req->gate, comm, req->node_tag.rank);
-#elif defined(STARPU_USE_MPI_MPI)
-	req->is_internal_req = is_internal_req;
-	/* For internal requests, we wait for both the request completion and the matching application request completion */
-	req->to_destroy = !is_internal_req;
 	req->count = count;
-#endif
+
+	_mpi_backend._starpu_mpi_backend_request_fill(req, comm, is_internal_req);
 
 	return req;
 }
 
 void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req)
 {
-#ifdef STARPU_USE_MPI_NMAD
-	piom_cond_destroy(&(req->req_cond));
-#elif defined(STARPU_USE_MPI_MPI)
-	STARPU_PTHREAD_MUTEX_DESTROY(&req->req_mutex);
-	STARPU_PTHREAD_COND_DESTROY(&req->req_cond);
-	STARPU_PTHREAD_MUTEX_DESTROY(&req->posted_mutex);
-	STARPU_PTHREAD_COND_DESTROY(&req->posted_cond);
+	_mpi_backend._starpu_mpi_backend_request_destroy(req);
 	free(req->datatype_name);
 	req->datatype_name = NULL;
-#endif
 #ifdef STARPU_SIMGRID
 	starpu_pthread_queue_unregister(&_starpu_mpi_thread_wait, &req->queue);
 	starpu_pthread_queue_destroy(&req->queue);

+ 43 - 2
mpi/src/starpu_mpi_task_insert.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2012,2014,2016,2017                      Inria
  * Copyright (C) 2011-2019                                CNRS
- * Copyright (C) 2011-2018                                Université de Bordeaux
+ * Copyright (C) 2011-2019                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -375,10 +375,19 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
 			(void)va_arg(varg_list_copy, void *);
 		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG_NFREE)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+			(void)va_arg(varg_list_copy, void *);
+		}
 		else if (arg_type==STARPU_CALLBACK_ARG)
 		{
 			(void)va_arg(varg_list_copy, void *);
 		}
+		else if (arg_type==STARPU_CALLBACK_ARG_NFREE)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			prio = va_arg(varg_list_copy, int);
@@ -411,6 +420,10 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
                 {
                         (void)va_arg(varg_list_copy, void *);
                 }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
+                {
+                        (void)va_arg(varg_list_copy, void *);
+                }
                 else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
                 {
 			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
@@ -419,6 +432,10 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
                 {
                         (void)va_arg(varg_list_copy, void *);
 		}
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
+                {
+                        (void)va_arg(varg_list_copy, void *);
+		}
 		else if (arg_type==STARPU_EXECUTE_WHERE)
 		{
 			// the flag is decoded and set later when
@@ -469,6 +486,27 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 		{
 			(void)va_arg(varg_list_copy, int);
 		}
+		else if (arg_type==STARPU_TASK_WORKERIDS)
+		{
+			(void)va_arg(varg_list_copy, unsigned);
+			(void)va_arg(varg_list_copy, uint32_t*);
+		}
+		else if (arg_type==STARPU_SEQUENTIAL_CONSISTENCY)
+		{
+			(void)va_arg(varg_list_copy, unsigned);
+		}
+		else if (arg_type==STARPU_TASK_PROFILING_INFO)
+		{
+			(void)va_arg(varg_list_copy, struct starpu_profiling_task_info *);
+		}
+		else if (arg_type==STARPU_TASK_NO_SUBMITORDER)
+		{
+			(void)va_arg(varg_list_copy, unsigned);
+		}
+		else if (arg_type==STARPU_TASK_SCHED_DATA)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -552,6 +590,9 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 
 		*task = starpu_task_create();
 		(*task)->cl_arg_free = 1;
+		(*task)->callback_arg_free = 1;
+		(*task)->prologue_callback_arg_free = 1;
+		(*task)->prologue_callback_pop_arg_free = 1;
 
 		va_copy(varg_list_copy, varg_list);
 		_starpu_task_insert_create(codelet, *task, varg_list_copy);
@@ -824,7 +865,7 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 				// Submit taskA
 				starpu_task_insert(&_starpu_mpi_redux_data_read_cl,
 						   STARPU_R, data_handle,
-						   STARPU_CALLBACK_WITH_ARG, _starpu_mpi_redux_data_recv_callback, args,
+						   STARPU_CALLBACK_WITH_ARG_NFREE, _starpu_mpi_redux_data_recv_callback, args,
 						   0);
 			}
 		}

+ 72 - 24
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2016-2019                                CNRS
- * Copyright (C) 2017,2018                                Université de Bordeaux
+ * Copyright (C) 2017,2018-2019                                Université de Bordeaux
  * Copyright (C) 2016                                     Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -210,11 +210,23 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* void* */
 		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG_NFREE)
+		{
+			arg_i++;
+			/* _starpu_callback_func_t */
+			arg_i++;
+			/* void* */
+		}
 		else if (arg_type==STARPU_CALLBACK_ARG)
 		{
 			arg_i++;
 			/* void* */
 		}
+		else if (arg_type==STARPU_CALLBACK_ARG_NFREE)
+		{
+			arg_i++;
+			/* void* */
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			prio = *(int *)arglist[arg_i];
@@ -255,6 +267,11 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* void* */
                 }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
+                {
+			arg_i++;
+			/* void* */
+                }
                 else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
                 {
 			arg_i++;
@@ -265,6 +282,11 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* void* */
 		}
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
+                {
+			arg_i++;
+			/* void* */
+		}
 		else if (arg_type==STARPU_EXECUTE_WHERE)
 		{
 			arg_i++;
@@ -320,6 +342,33 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* int */
 		}
+		else if (arg_type==STARPU_TASK_WORKERIDS)
+		{
+			arg_i++;
+			/* unsigned */
+			arg_i++;
+			/* uint32_t* */
+		}
+		else if (arg_type==STARPU_SEQUENTIAL_CONSISTENCY)
+		{
+			arg_i++;
+			/* unsigned */
+		}
+		else if (arg_type==STARPU_TASK_PROFILING_INFO)
+		{
+			arg_i++;
+			/* struct starpu_profiling_task_info * */
+		}
+		else if (arg_type==STARPU_TASK_NO_SUBMITORDER)
+		{
+			arg_i++;
+			/* unsigned */
+		}
+		else if (arg_type==STARPU_TASK_SCHED_DATA)
+		{
+			arg_i++;
+			/* void * */
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -400,6 +449,9 @@ int _fstarpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, str
 
 		*task = starpu_task_create();
 		(*task)->cl_arg_free = 1;
+		(*task)->callback_arg_free = 1;
+		(*task)->prologue_callback_arg_free = 1;
+		(*task)->prologue_callback_pop_arg_free = 1;
 
 		_fstarpu_task_insert_create(codelet, *task, arglist);
 		return 0;
@@ -441,26 +493,26 @@ int _fstarpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, vo
 	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
 }
 
-int fstarpu_mpi_task_insert(MPI_Fint comm, void ***_arglist)
+void fstarpu_mpi_task_insert(void **arglist)
 {
-	void **arglist = *_arglist;
-	struct starpu_codelet *codelet = arglist[0];
+	MPI_Fint comm = *((MPI_Fint *)arglist[0]);
+	struct starpu_codelet *codelet = arglist[1];
 	if (codelet == NULL)
 	{
 		STARPU_ABORT_MSG("task without codelet");
 	}
-	int ret;
 
-	ret = _fstarpu_mpi_task_insert_v(MPI_Comm_f2c(comm), codelet, arglist+1);
-	return ret;
+	int ret;
+	ret = _fstarpu_mpi_task_insert_v(MPI_Comm_f2c(comm), codelet, arglist+2);
+	STARPU_ASSERT(ret >= 0);
 }
 
 /* fstarpu_mpi_insert_task: aliased to fstarpu_mpi_task_insert in fstarpu_mpi_mod.f90 */
 
-struct starpu_task *fstarpu_mpi_task_build(MPI_Fint comm, void ***_arglist)
+struct starpu_task *fstarpu_mpi_task_build(void **arglist)
 {
-	void **arglist = *_arglist;
-	struct starpu_codelet *codelet = arglist[0];
+	MPI_Fint comm = *((MPI_Fint *)arglist[0]);
+	struct starpu_codelet *codelet = arglist[1];
 	if (codelet == NULL)
 	{
 		STARPU_ABORT_MSG("task without codelet");
@@ -468,38 +520,34 @@ struct starpu_task *fstarpu_mpi_task_build(MPI_Fint comm, void ***_arglist)
 	struct starpu_task *task;
 	int ret;
 
-	ret = _fstarpu_mpi_task_build_v(MPI_Comm_f2c(comm), codelet, &task, NULL, NULL, NULL, NULL, arglist+1);
+	ret = _fstarpu_mpi_task_build_v(MPI_Comm_f2c(comm), codelet, &task, NULL, NULL, NULL, NULL, arglist+2);
 	STARPU_ASSERT(ret >= 0);
 	return (ret > 0) ? NULL : task;
 }
 
-int fstarpu_mpi_task_post_build(MPI_Fint _comm, void ***_arglist)
+void fstarpu_mpi_task_post_build(void **arglist)
 {
-	void **arglist = *_arglist;
-	struct starpu_codelet *codelet = arglist[0];
+	MPI_Fint comm = *((MPI_Fint *)arglist[0]);
+	struct starpu_codelet *codelet = arglist[1];
 	if (codelet == NULL)
 	{
 		STARPU_ABORT_MSG("task without codelet");
 	}
-	MPI_Comm comm = MPI_Comm_f2c(_comm);
 	int xrank, do_execute;
 	int ret, me, nb_nodes;
 	struct starpu_data_descr *descrs;
 	int nb_data;
 	int prio;
 
-	starpu_mpi_comm_rank(comm, &me);
-	starpu_mpi_comm_size(comm, &nb_nodes);
+	starpu_mpi_comm_rank(MPI_Comm_f2c(comm), &me);
+	starpu_mpi_comm_size(MPI_Comm_f2c(comm), &nb_nodes);
 
 	/* Find out whether we are to execute the data because we own the data to be written to. */
-	ret = _fstarpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, &prio, arglist);
-	if (ret < 0)
-		return ret;
+	ret = _fstarpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, &prio, arglist+2);
+	STARPU_ASSERT(ret >= 0);
 
-	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
+	ret = _starpu_mpi_task_postbuild_v(MPI_Comm_f2c(comm), xrank, do_execute, descrs, nb_data, prio);
+	STARPU_ASSERT(ret >= 0);
 }
 
 #endif /* HAVE_MPI_COMM_F2C */
-
-
-

+ 3 - 1
mpi/tests/Makefile.am

@@ -118,7 +118,8 @@ starpu_mpi_TESTS +=				\
 	policy_selection			\
 	policy_selection2			\
 	ring_async_implicit			\
-	temporary
+	temporary				\
+	early_stuff
 
 if !STARPU_SIMGRID
 starpu_mpi_TESTS +=				\
@@ -183,6 +184,7 @@ noinst_PROGRAMS =				\
 	ring_async				\
 	ring_async_implicit			\
 	temporary				\
+	early_stuff				\
 	block_interface				\
 	block_interface_pinned			\
 	attr					\

+ 3 - 3
mpi/tests/attr.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2017                                     Inria
- * Copyright (C) 2017,2018                                CNRS
+ * Copyright (C) 2017,2018,2019                           CNRS
  * Copyright (C) 2017,2018                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -33,8 +33,8 @@ int main(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED)
 	STARPU_ASSERT_MSG(flag == 1, "starpu_mpi_comm_get_attr was called with valid argument\n");
 
 	rvalue = *value;
-	FPRINTF(stderr, "Value: %"PRIi64"d\n", *value);
-	FPRINTF(stderr, "Value: %"PRIi64"d\n", rvalue);
+	FPRINTF(stderr, "Value: %"PRIi64"\n", *value);
+	FPRINTF(stderr, "Value: %"PRIi64"\n", rvalue);
 
 	return 0;
 }

+ 8 - 3
mpi/tests/block_interface.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2011,2014,2015,2017,2018            Université de Bordeaux
  * Copyright (C) 2013                                     Inria
- * Copyright (C) 2010-2012,2014,2015,2017                 CNRS
+ * Copyright (C) 2010-2012,2014,2015,2017,2019            CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,11 @@
 #include <stdlib.h>
 #include "helper.h"
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 
 #define BIGSIZE	128
 #define SIZE	64
@@ -44,7 +48,8 @@ int main(int argc, char **argv)
 			FPRINTF(stderr, "We need at least 2 processes.\n");
 
 		starpu_mpi_shutdown();
-		MPI_Finalize();
+		if (!mpi_init)
+			MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 	}
 

+ 7 - 3
mpi/tests/block_interface_pinned.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2011,2014,2015,2017,2018            Université de Bordeaux
  * Copyright (C) 2013                                     Inria
- * Copyright (C) 2010-2012,2015,2017                      CNRS
+ * Copyright (C) 2010-2012,2015,2017,2019                 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,9 +20,13 @@
 #include <stdlib.h>
 #include "helper.h"
 
-#define NITER	2048
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
 
-#define BIGSIZE	64
+#define BIGSIZE	128
 #define SIZE	64
 
 int main(int argc, char **argv)

+ 5 - 5
mpi/tests/callback.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013-2015,2017                           CNRS
+ * Copyright (C) 2013-2015,2017,2019                      CNRS
  * Copyright (C) 2014,2015,2017,2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -71,7 +71,7 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD,
 				     NULL,
 				     STARPU_EXECUTE_ON_NODE, 0,
-				     STARPU_CALLBACK_WITH_ARG, callback, &x,
+				     STARPU_CALLBACK_WITH_ARG_NFREE, callback, &x,
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 
@@ -81,7 +81,7 @@ int main(int argc, char **argv)
 				     NULL,
 				     STARPU_EXECUTE_ON_NODE, 0,
 				     STARPU_CALLBACK, callback,
-				     STARPU_CALLBACK_ARG, &x,
+				     STARPU_CALLBACK_ARG_NFREE, &x,
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 				     NULL,
 				     STARPU_EXECUTE_ON_NODE, 0,
 				     STARPU_PROLOGUE_CALLBACK, prologue_callback,
-				     STARPU_PROLOGUE_CALLBACK_ARG, &y,
+				     STARPU_PROLOGUE_CALLBACK_ARG_NFREE, &y,
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
@@ -103,7 +103,7 @@ int main(int argc, char **argv)
 				     &my_codelet,
 				     STARPU_EXECUTE_ON_NODE, 0,
 				     STARPU_PROLOGUE_CALLBACK_POP, prologue_callback,
-				     STARPU_PROLOGUE_CALLBACK_POP_ARG, &y,
+				     STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE, &y,
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 

+ 125 - 0
mpi/tests/early_stuff.c

@@ -0,0 +1,125 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+#ifndef STARPU_USE_MPI_MPI
+int main(int argc, char **argv)
+{
+	int mpi_init;
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return 0;
+}
+
+#else
+
+#include <mpi/starpu_mpi_early_data.h>
+#include <mpi/starpu_mpi_early_request.h>
+#include <mpi/starpu_mpi_mpi_backend.h>
+
+void early_data()
+{
+	struct _starpu_mpi_early_data_handle *edh[2];
+	struct _starpu_mpi_envelope envelope[2];
+	struct _starpu_mpi_node_tag node_tag[2];
+	struct _starpu_mpi_early_data_handle *early;
+	struct _starpu_mpi_early_data_handle_tag_hashlist *hash;
+
+	memset(&node_tag[0], 0, sizeof(struct _starpu_mpi_node_tag));
+	node_tag[0].node.rank = 1;
+	node_tag[0].node.comm = MPI_COMM_WORLD;
+	node_tag[0].data_tag = 42;
+
+	memset(&node_tag[1], 0, sizeof(struct _starpu_mpi_node_tag));
+	node_tag[1].node.rank = 2;
+	node_tag[1].node.comm = MPI_COMM_WORLD;
+	node_tag[1].data_tag = 84;
+
+	envelope[0].data_tag = node_tag[0].data_tag;
+	edh[0] = _starpu_mpi_early_data_create(&envelope[0], node_tag[0].node.rank, node_tag[0].node.comm);
+
+	envelope[1].data_tag = node_tag[1].data_tag;
+	edh[1] = _starpu_mpi_early_data_create(&envelope[1], node_tag[1].node.rank, node_tag[1].node.comm);
+
+	_starpu_mpi_early_data_add(edh[0]);
+	_starpu_mpi_early_data_add(edh[1]);
+
+	hash = _starpu_mpi_early_data_extract(&node_tag[1]);
+	STARPU_ASSERT(_starpu_mpi_early_data_handle_list_size(&hash->list) == 1);
+	early = _starpu_mpi_early_data_handle_list_pop_front(&hash->list);
+	STARPU_ASSERT(early->node_tag.node.comm == node_tag[1].node.comm && early->node_tag.node.rank == node_tag[1].node.rank && early->node_tag.data_tag == node_tag[1].data_tag);
+	STARPU_ASSERT(_starpu_mpi_early_data_handle_list_size(&hash->list) == 0);
+
+	early = _starpu_mpi_early_data_find(&node_tag[0]);
+	STARPU_ASSERT(early->node_tag.node.comm == node_tag[0].node.comm && early->node_tag.node.rank == node_tag[0].node.rank && early->node_tag.data_tag == node_tag[0].data_tag);
+}
+
+void early_request()
+{
+	struct _starpu_mpi_req req[2];
+	struct _starpu_mpi_req *early;
+	struct _starpu_mpi_early_request_tag_hashlist *hash;
+
+	memset(&req[0].node_tag, 0, sizeof(struct _starpu_mpi_node_tag));
+	req[0].node_tag.node.rank = 1;
+	req[0].node_tag.node.comm = MPI_COMM_WORLD;
+	req[0].node_tag.data_tag = 42;
+
+	memset(&req[1].node_tag, 0, sizeof(struct _starpu_mpi_node_tag));
+	req[1].node_tag.node.rank = 2;
+	req[1].node_tag.node.comm = MPI_COMM_WORLD;
+	req[1].node_tag.data_tag = 84;
+
+	_starpu_mpi_early_request_enqueue(&req[1]);
+	_starpu_mpi_early_request_enqueue(&req[0]);
+
+	early = _starpu_mpi_early_request_dequeue(req[0].node_tag.data_tag, req[0].node_tag.node.rank, req[0].node_tag.node.comm);
+	STARPU_ASSERT(early->node_tag.data_tag == req[0].node_tag.data_tag && early->node_tag.node.rank == req[0].node_tag.node.rank && early->node_tag.node.comm == req[0].node_tag.node.comm);
+
+	hash = _starpu_mpi_early_request_extract(req[1].node_tag.data_tag, req[1].node_tag.node.rank, req[1].node_tag.node.comm);
+	STARPU_ASSERT(_starpu_mpi_req_list_size(&hash->list) == 1);
+	early = _starpu_mpi_req_list_pop_front(&hash->list);
+	STARPU_ASSERT(_starpu_mpi_req_list_size(&hash->list) == 0);
+	STARPU_ASSERT(early->node_tag.data_tag == req[1].node_tag.data_tag && early->node_tag.node.rank == req[1].node_tag.node.rank && early->node_tag.node.comm == req[1].node_tag.node.comm);
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size, i;
+	starpu_data_handle_t tab_handle[4];
+	int mpi_init;
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	early_data();
+	early_request();
+
+	starpu_mpi_shutdown();
+
+	if (!mpi_init)
+		MPI_Finalize();
+	return 0;
+}
+
+#endif

+ 5 - 10
socl/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2012,2013,2015,2017,2018                 CNRS
+# Copyright (C) 2012,2013,2015,2017,2018,2019            CNRS
 # Copyright (C) 2011-2013,2015                           Université de Bordeaux
 # Copyright (C) 2011,2012                                Inria
 #
@@ -15,6 +15,8 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
+include $(top_srcdir)/starpu.mk
+
 SUBDIRS = src examples
 
 EXTRA_DIST = README
@@ -22,16 +24,9 @@ EXTRA_DIST = README
 SOCL_vendorsdir = @datarootdir@/starpu/opencl/vendors
 dist_SOCL_vendors_DATA = @SOCL_VENDORS@
 
-showcheck:
-	RET=0 ; \
-	for i in $(SUBDIRS) ; do \
-		make -C $$i showcheck || RET=1 ; \
-	done ; \
-	exit $$RET
-
-showsuite:
+recheck:
 	RET=0 ; \
 	for i in $(SUBDIRS) ; do \
-		make -C $$i showsuite || RET=1 ; \
+		make -C $$i recheck || RET=1 ; \
 	done ; \
 	exit $$RET

+ 7 - 1
socl/src/Makefile.am

@@ -15,7 +15,6 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
-include $(top_srcdir)/starpu.mk
 
 CLEANFILES = *.gcno *.gcda
 
@@ -136,3 +135,10 @@ libsocl_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
   cl_geteventprofilinginfo.c \
   cl_getextensionfunctionaddress.c \
   cl_icdgetplatformidskhr.c
+
+recheck:
+	-cat /dev/null
+showcheck:
+	-cat /dev/null
+showsuite:
+	-cat /dev/null

+ 12 - 1
src/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2011-2017                                Inria
+# Copyright (C) 2011-2017,2019                           Inria
 # Copyright (C) 2009-2019                                Université de Bordeaux
 # Copyright (C) 2010-2015,2017,2018,2019                 CNRS
 # Copyright (C) 2013                                     Simon Archipoff
@@ -128,6 +128,7 @@ noinst_HEADERS = 						\
 	common/rbtree_i.h					\
 	common/prio_list.h					\
 	common/graph.h						\
+	common/knobs.h						\
 	drivers/driver_common/driver_common.h			\
 	drivers/mp_common/mp_common.h				\
 	drivers/mp_common/source_common.h			\
@@ -170,6 +171,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	common/rbtree.c						\
 	common/graph.c						\
 	common/inlines.c					\
+	common/knobs.c						\
 	core/jobs.c						\
 	core/task.c						\
 	core/task_bundle.c					\
@@ -282,14 +284,19 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	sched_policies/component_prio.c 				\
 	sched_policies/component_random.c				\
 	sched_policies/component_eager.c				\
+	sched_policies/component_eager_prio.c				\
 	sched_policies/component_eager_calibration.c				\
 	sched_policies/component_mct.c				\
 	sched_policies/component_heft.c				\
+	sched_policies/component_heteroprio.c				\
 	sched_policies/component_best_implementation.c		\
 	sched_policies/component_perfmodel_select.c				\
 	sched_policies/component_composed.c				\
 	sched_policies/component_work_stealing.c				\
+	sched_policies/component_stage.c				\
+	sched_policies/component_userchoice.c				\
 	sched_policies/modular_eager.c				\
+	sched_policies/modular_eager_prio.c				\
 	sched_policies/modular_eager_prefetching.c				\
 	sched_policies/modular_gemm.c				\
 	sched_policies/modular_prio.c				\
@@ -300,6 +307,8 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	sched_policies/modular_parallel_heft.c			\
 	sched_policies/modular_heft.c				\
 	sched_policies/modular_heft_prio.c			\
+	sched_policies/modular_heteroprio.c			\
+	sched_policies/modular_heteroprio_heft.c		\
 	sched_policies/modular_heft2.c				\
 	sched_policies/modular_ws.c				\
 	sched_policies/modular_ez.c
@@ -408,6 +417,8 @@ dist-hook:
 	done ; \
 	[ $$failed == 0 ]
 
+recheck:
+	-cat /dev/null
 showcheck:
 	-cat /dev/null
 showsuite:

+ 0 - 0
src/common/fxt.c


Some files were not shown because too many files changed in this diff