Browse Source

Merge branch 'master' of gitlab.inria.fr:starpu/starpu into python

HE Kun 5 years ago
parent
commit
710f74edd9
100 changed files with 5508 additions and 287 deletions
  1. 38 7
      ChangeLog
  2. 12 0
      README.dev
  3. 42 34
      configure.ac
  4. 4 4
      doc/doxygen/Makefile.am
  5. 64 5
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  6. 27 3
      doc/doxygen/chapters/501_environment_variables.doxy
  7. 1754 0
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.eps
  8. BIN
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.pdf
  9. BIN
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.png
  10. 1388 0
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.eps
  11. BIN
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.pdf
  12. BIN
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.png
  13. 1416 0
      doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.eps
  14. BIN
      doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.pdf
  15. BIN
      doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.png
  16. 3 3
      doc/doxygen/doxygen.cfg
  17. 4 5
      doc/doxygen_dev/Makefile.am
  18. 7 7
      doc/doxygen_dev/chapters/010_core.doxy
  19. 65 0
      doc/doxygen_dev/dev/starpu_check_missing.sh
  20. 29 1
      doc/doxygen_dev/doxygen-config.cfg.in
  21. 5 3
      doc/doxygen_dev/doxygen.cfg
  22. 30 1
      doc/doxygen_dev/refman.tex
  23. 6 5
      examples/Makefile.am
  24. 7 3
      examples/axpy/axpy_opencl.c
  25. 4 0
      examples/axpy/axpy_opencl_kernel.cl
  26. 8 5
      examples/cholesky/cholesky_implicit.c
  27. 8 4
      examples/lu/lu.sh
  28. 19 7
      examples/lu/lu_example.c
  29. 13 1
      examples/mlr/mlr.c
  30. 31 27
      examples/mult/sgemm.sh
  31. 16 0
      examples/native_fortran/nf_vector.f90
  32. 29 5
      examples/scheduler/schedulers.sh
  33. 4 3
      examples/stencil/Makefile.am
  34. 57 0
      include/fstarpu_mod.f90
  35. 14 0
      include/starpu.h
  36. 1 1
      include/starpu_clusters.h
  37. 5 5
      include/starpu_config.h.in
  38. 3 1
      include/starpu_data.h
  39. 4 0
      include/starpu_fxt.h
  40. 15 1
      include/starpu_task.h
  41. 3 0
      include/starpu_worker.h
  42. 3 3
      julia/examples/Makefile.am
  43. 3 3
      julia/examples/old_examples/mult/mult.c
  44. 2 2
      julia/src/Makefile.am
  45. 2 2
      julia/src/dynamic_compiler/Makefile.am
  46. 45 0
      mpi/GNUmakefile.in
  47. 0 18
      mpi/Makefile.am
  48. 3 4
      mpi/examples/Makefile.am
  49. 27 6
      mpi/examples/benchs/abstract_sendrecv_bench.c
  50. 1 1
      mpi/examples/benchs/abstract_sendrecv_bench.h
  51. 21 6
      mpi/examples/benchs/sendrecv_bench.c
  52. 1 1
      mpi/examples/benchs/sendrecv_gemm_bench.c
  53. 0 1
      mpi/examples/benchs/sendrecv_parallel_tasks_bench.c
  54. 1 1
      mpi/examples/matrix_mult/mm.c
  55. 4 1
      mpi/examples/mpi_lu/plu_example.c
  56. 4 1
      mpi/examples/mpi_lu/plu_implicit_example.c
  57. 4 1
      mpi/examples/mpi_lu/plu_outofcore_example.c
  58. 9 9
      mpi/examples/mpi_lu/pxlu.c
  59. 4 5
      mpi/src/Makefile.am
  60. 22 20
      mpi/src/load_balancer/policy/data_movements_interface.c
  61. 10 8
      mpi/src/load_balancer/policy/data_movements_interface.h
  62. 4 2
      mpi/src/load_balancer/policy/load_balancer_policy.h
  63. 11 9
      mpi/src/load_balancer/policy/load_data_interface.h
  64. 6 6
      mpi/src/load_balancer/policy/load_heat_propagation.c
  65. 2 0
      mpi/src/mpi/starpu_mpi_comm.h
  66. 2 0
      mpi/src/mpi/starpu_mpi_driver.h
  67. 2 0
      mpi/src/mpi/starpu_mpi_early_data.h
  68. 2 0
      mpi/src/mpi/starpu_mpi_early_request.h
  69. 7 0
      mpi/src/mpi/starpu_mpi_mpi.c
  70. 2 0
      mpi/src/mpi/starpu_mpi_mpi.h
  71. 3 1
      mpi/src/mpi/starpu_mpi_mpi_backend.h
  72. 2 0
      mpi/src/mpi/starpu_mpi_sync_data.h
  73. 2 0
      mpi/src/mpi/starpu_mpi_tag.h
  74. 9 0
      mpi/src/nmad/starpu_mpi_nmad.c
  75. 2 0
      mpi/src/nmad/starpu_mpi_nmad.h
  76. 3 1
      mpi/src/nmad/starpu_mpi_nmad_backend.h
  77. 2 0
      mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.h
  78. 14 12
      mpi/src/starpu_mpi.c
  79. 2 0
      mpi/src/starpu_mpi_cache.h
  80. 2 0
      mpi/src/starpu_mpi_cache_stats.h
  81. 1 1
      mpi/src/starpu_mpi_datatype.c
  82. 2 0
      mpi/src/starpu_mpi_datatype.h
  83. 2 0
      mpi/src/starpu_mpi_fxt.h
  84. 2 0
      mpi/src/starpu_mpi_init.h
  85. 2 0
      mpi/src/starpu_mpi_private.c
  86. 8 3
      mpi/src/starpu_mpi_private.h
  87. 2 0
      mpi/src/starpu_mpi_select_node.h
  88. 2 0
      mpi/src/starpu_mpi_stats.h
  89. 5 5
      mpi/src/starpu_mpi_task_insert.c
  90. 2 0
      mpi/src/starpu_mpi_task_insert.h
  91. 5 3
      mpi/tests/Makefile.am
  92. 84 0
      mpi/tests/insert_task_tags.c
  93. 1 1
      mpi/tests/ring.c
  94. 1 1
      mpi/tests/ring_async.c
  95. 1 1
      mpi/tests/ring_async_implicit.c
  96. 1 1
      mpi/tests/ring_sync.c
  97. 1 1
      mpi/tests/ring_sync_detached.c
  98. 3 3
      mpi/tests/user_defined_datatype.c
  99. 3 2
      mpi/tools/Makefile.am
  100. 0 0
      sc_hypervisor/examples/Makefile.am

+ 38 - 7
ChangeLog

@@ -43,13 +43,50 @@ New features:
     can make prefetch more aggressive.
   * Add starpu_data_dup_ro().
   * Add starpu_data_release_to() and starpu_data_release_to_on_node().
+  * Add profiling based on papi performance counters.
 
 Small changes:
   * Add a synthetic energy efficiency testcase.
 
-StarPU 1.3.5 (git revision xxx)
+StarPU 1.3.8
 ====================================================================
 
+Small features:
+  * A codelet can now define a callback function pointer which will be
+    automatically called when the task does not itself define a
+    callback function; otherwise, it can still be called from the
+    task callback function.
+
+StarPU 1.3.7
+====================================================================
+
+Small changes:
+  * Simgrid: bug fix for setting network/weight-S to 0.0
+
+StarPU 1.3.6 (git revision fb9fbed81410d9f0ebbff5bdad1352df4705efe8)
+====================================================================
+
+Small features:
+  * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to set
+    the exponential backoff limits of the number of cycles to pause while
+    drivers are spinning.
+  * Add STARPU_DISPLAY_BINDINGS environment variable and
+    starpu_display_bindings() function to display all bindings on the machine by
+    calling hwloc-ps
+  * New function starpu_get_pu_os_index() to convert logical index of a PU to
+    its OS index.
+  * New function starpu_get_hwloc_topology() to get the hwloc topology used by
+    StarPU.
+
+StarPU 1.3.5 (git revision 5f7458799f548026fab357b18541bb462dde2b53)
+====================================================================
+
+Small features:
+  * New environment variable STARPU_FXT_SUFFIX to set the filename in
+    which to save the fxt trace
+  * New option -d for starpu_fxt_tool to specify in which directory to
+    generate files
+
 Small changes:
   * Move MPI cache functions into the public API
   * Add STARPU_MPI_NOBIND environment variable.
@@ -80,12 +117,6 @@ Small features:
     starpu_mpi_interface_datatype_unregister() which take a enum
     starpu_data_interface_id instead of a starpu_data_handle_t
   * New script starpu_env to set up StarPU environment variables
-  * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to the
-    exponential backoff limits of the number of cycles to pause while drivers
-    are spinning.
-  * Add STARPU_DISPLAY_BINDINGS environment variable and
-    starpu_display_bindings() function to display all bindings on the machine by
-    calling hwloc-ps
 
 Small changes:
   * New configure option --disable-build-doc-pdf

+ 12 - 0
README.dev

@@ -101,3 +101,15 @@ Error handling
   --enable-fast.
 
 	STARPU_ASSERT(j->terminated != 0)
+
+
+
+Makefile.am
+-----------
+
+Dependency libraries are appended to LIBS.
+Only real LDFLAGS such as -no-undefined go to LDFLAGS.
+
+If a program foo needs more libraries, it can put them in foo_LDADD.
+
+(No, AM_LDADD does not exist)

+ 42 - 34
configure.ac

@@ -152,14 +152,14 @@ if test x$enable_simgrid = xyes ; then
 	PKG_CHECK_MODULES([SIMGRID], [simgrid], [], [:])
 
 	if test "$simgrid_include_dir" != "no" ; then
-	   	SIMGRID_CFLAGS="$SIMGRID_CFLAGS -I$simgrid_include_dir"
+		SIMGRID_CFLAGS="-I$simgrid_include_dir $SIMGRID_CFLAGS"
 	fi
 	if test "$simgrid_lib_dir" != "no" ; then
-	   	SIMGRID_LIBS="$SIMGRID_LIBS -L$simgrid_lib_dir"
+		SIMGRID_LIBS="-L$simgrid_lib_dir $SIMGRID_LIBS"
 	fi
 	if test "$simgrid_dir" != "no" ; then
-	   	SIMGRID_CFLAGS="$SIMGRID_CFLAGS -I$simgrid_dir/include"
-	   	SIMGRID_LIBS="$SIMGRID_LIBS -L$simgrid_dir/lib"
+		SIMGRID_CFLAGS="-I$simgrid_dir/include $SIMGRID_CFLAGS"
+		SIMGRID_LIBS="-L$simgrid_dir/lib $SIMGRID_LIBS"
 	fi
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
@@ -226,9 +226,9 @@ if test x$enable_simgrid = xyes ; then
 		LIBS="$LIBS -lstdc++"
 	fi
 
-	case \ $CXXFLAGS\  in 
+	case \ $CXXFLAGS\  in
 	*\ -std=*\ *) ;;
-	*) 
+	*)
 		AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 				  #ifdef STARPU_HAVE_SIMGRID_MSG_H
 				  #include <simgrid/msg.h>
@@ -248,6 +248,8 @@ if test x$enable_simgrid = xyes ; then
 		AC_DEFINE(STARPU_SIMGRID_MC, [1], [Define this to enable Model Checker in simgrid execution])
 		AC_PATH_PROG([SIMGRID_MC], [simgrid-mc], [no], [$simgrid_dir/bin:$PATH])
 		LDFLAGS="$LDFLAGS -Wl,-znorelro -Wl,-znoseparate-code"
+		# libsimgrid needs to be linked from binaries themselves for MC to work
+		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lsimgrid"
 	fi
 fi
 AM_CONDITIONAL(STARPU_SIMGRID_MC, test x$enable_simgrid_mc = xyes)
@@ -448,6 +450,10 @@ if test x$enable_mpi = xyes -a x$enable_nmad = xyes ; then
     build_nmad_lib=yes
     build_mpi_lib=no
     PKG_CHECK_MODULES([NMAD],[nmad])
+    AC_CHECK_DECL([piom_ltask_set_bound_thread_os_indexes], have_piom_ltask_set_bound_thread_os_indexes=yes, have_piom_ltask_set_bound_thread_os_indexes=no, [[#include <pioman.h>]])
+    if test x$have_piom_ltask_set_bound_thread_os_indexes = xyes; then
+      AC_DEFINE(HAVE_PIOM_LTASK_SET_BOUND_THREAD_OS_INDEXES, [1], [piom_ltask_set_bound_thread_os_indexes is available])
+    fi
 else
     build_nmad_lib=no
 fi
@@ -1298,9 +1304,9 @@ if test x$enable_cuda = xyes; then
 	STARPU_CUFFT_LDFLAGS="-lcufft"
 
 	AC_LANG_PUSH([C++])
-	case \ $NVCCFLAGS\  in 
+	case \ $NVCCFLAGS\  in
 	*\ -std=*\ *) ;;
-	*) 
+	*)
 		SAVED_CXX="$CXX"
 		CXX="$NVCC"
 		AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
@@ -1323,7 +1329,7 @@ if test x$enable_cuda = xyes; then
 
 	#in case this is a 64bit setup, we tell nvcc to use a -m64 flag, if missing from existing flags
 	if test x$SIZEOF_VOID_P = x8; then
-		case \ $NVCCFLAGS\  in 
+		case \ $NVCCFLAGS\  in
 			*\ -m64\ *) ;;
 			*) NVCCFLAGS="${NVCCFLAGS} -m64" ;;
 		esac
@@ -2079,8 +2085,10 @@ if test x$use_fxt = xyes; then
 	LIBS="$LIBS $FXT_LIBS"
 	save_LDFLAGS="$LDFLAGS"
 	LDFLAGS="$LDFLAGS $FXT_LDFLAGS"
-   	AC_CHECK_FUNCS([enable_fut_flush])
-   	AC_CHECK_FUNCS([fut_set_filename])
+	AC_CHECK_FUNCS([fxt_close])
+	AC_CHECK_FUNCS([fxt_blockev_leave])
+	AC_CHECK_FUNCS([enable_fut_flush])
+	AC_CHECK_FUNCS([fut_set_filename])
 	AC_CHECK_FUNCS([fut_setup_flush_callback])
 	LDFLAGS="$save_LDFLAGS"
 	LIBS="$save_LIBS"
@@ -2705,7 +2713,7 @@ if test "x$use_mpi_master_slave" = "xyes" ; then
       AC_MSG_ERROR([MPI Master-Slave and SOCL can not be used at the same time !])
    fi
    if test "x$enable_socl" = "xmaybe" ; then
-     enable_socl=no 
+     enable_socl=no
    fi
 fi
 
@@ -3271,9 +3279,9 @@ then
 			[AS_HELP_STRING([--with-dlb-include-dir=<path>],
 			[specify where DLB headers are installed])],
 			[dlb_inc_dirs="$withval"], [dlb_inc_dirs=""])
-	
+
 		dlb_inc_dirs="${dlb_inc_dirs} /usr/include/dlb"
-	
+
 		dlb_incdir_found=no
 		for dlb_incdir in $dlb_inc_dirs
 		do
@@ -3294,18 +3302,18 @@ then
 				unset ac_cv_header_dlb_h
 			fi
 		done
-	
+
 		AC_ARG_WITH(dlb-lib-dir,
 			[AS_HELP_STRING([--with-dlb-lib-dir=<path>],
 			[specify where DLB libraries are installed])],
 			[dlb_lib_dirs="$withval"], [dlb_lib_dirs=""])
-	
+
 		dlb_lib_dirs="${dlb_lib_dirs} /usr/lib/dlb"
-	
+
 		dlb_libdir_found=no
 		for dlb_libdir in $dlb_lib_dirs
 		do
-			if test -n "$dlb_libdir" 
+			if test -n "$dlb_libdir"
 			then
 				SAVED_LDFLAGS="${LDFLAGS}"
 				LDFLAGS=-L${dlb_libdir}
@@ -3322,7 +3330,7 @@ then
 				unset ac_cv_lib_dlb_DLB_Init
 			fi
 		done
-	
+
 		SAVED_CPPFLAGS="${CPPFLAGS}"
 		SAVED_CFLAGS="${CFLAGS}"
 		SAVED_LDFLAGS="${LDFLAGS}"
@@ -3334,7 +3342,7 @@ then
 		CPPFLAGS="$SAVED_CPPFLAGS"
 		CFLAGS="$SAVED_CFLAGS"
 		LIBS="$SAVED_LIBS"
-	
+
 		SAVED_CPPFLAGS="${CPPFLAGS}"
 		SAVED_CFLAGS="${CFLAGS}"
 		SAVED_LDFLAGS="${LDFLAGS}"
@@ -3350,13 +3358,13 @@ then
 		CPPFLAGS="$SAVED_CPPFLAGS"
 		CFLAGS="$SAVED_CFLAGS"
 		LIBS="$SAVED_LIBS"
-	
+
 		if test "x$dlb_incdir_found" != "xyes" -o "x$dlb_libdir_found" != "xyes"
 		then
 			enable_dlb=no
 		fi
 	fi
-	
+
 	AC_MSG_CHECKING(whether DLB support should be enabled)
 	AC_MSG_RESULT($enable_dlb)
 	if test "x$enable_dlb" != "xno"
@@ -3386,34 +3394,31 @@ AC_ARG_ENABLE(starpurm-examples, [AS_HELP_STRING([--enable-starpurm-examples],
 			enable_starpurm_examples=$enableval, enable_starpurm_examples=no)
 AM_CONDITIONAL(STARPU_BUILD_STARPURM_EXAMPLES, [test x$enable_starpurm_examples = xyes])
 
-
-
 ##########################################
 # Documentation                          #
 ##########################################
 
 def_enable_build_doc="yes"
-def_enable_build_doc_pdf="no"
 available_doc="no"
 if test -d "$srcdir/doc/doxygen/html" ; then
    def_enable_build_doc="no"
    available_doc="yes"
 fi
-available_doc_pdf="no"
-if test -f "$srcdir/doc/doxygen/starpu.pdf" ; then
-   def_enable_build_doc="no"
-   def_enable_build_doc_pdf="no"
-   available_doc_pdf="yes"
-fi
 
 AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
 			[disable building of documentation])],
-			enable_build_doc=$enableval, enable_build_doc=$enable_build_doc)
+			enable_build_doc=$enableval, enable_build_doc=$def_enable_build_doc)
 
 AC_ARG_ENABLE(build-doc-pdf, [AS_HELP_STRING([--enable-build-doc-pdf],
 			[enable building of PDF documentation])],
 			enable_build_doc_pdf=$enableval, enable_build_doc_pdf=$def_enable_build_doc_pdf)
 
+available_doc_pdf="no"
+if test -f "$srcdir/doc/doxygen/starpu.pdf" ; then
+   enable_build_doc_pdf="no"
+   available_doc_pdf="yes"
+fi
+
 # Check whether doxygen needed tools are installed
 AC_PATH_PROG(doxygencommand, doxygen)
 if test "$doxygencommand" = "" ; then
@@ -3476,7 +3481,6 @@ LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_
 AC_SUBST([LIBSTARPU_LDFLAGS])
 
 LIBSTARPU_LINK=libstarpu-$STARPU_EFFECTIVE_VERSION.la
-AC_SUBST([LIBSTARPU_LINK])
 
 if test "x$enable_shared" = xno; then
         # No .so, so application will unexpectedly have to know which -l to
@@ -3484,6 +3488,8 @@ if test "x$enable_shared" = xno; then
 	AC_DEFINE(STARPU_STATIC_ONLY, [1], [Only static compilation was made])
 	STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
 fi
+LIBSTARPU_LINK="$LIBSTARPU_LINK $STARPU_EXPORTED_LIBS"
+AC_SUBST([LIBSTARPU_LINK])
 AC_SUBST(STARPU_EXPORTED_LIBS)
 
 # File configuration
@@ -3622,6 +3628,7 @@ AC_OUTPUT([
 	examples/stencil/Makefile
 	tests/Makefile
 	tests/loader-cross.sh
+	tests/model-checking/Makefile
 	tests/model-checking/starpu-mc.sh
 	examples/loader-cross.sh
 	examples/stencil/loader-cross.sh
@@ -3630,6 +3637,7 @@ AC_OUTPUT([
 	mpi/tests/Makefile
 	mpi/examples/Makefile
 	mpi/tools/Makefile
+	mpi/GNUmakefile
 	sc_hypervisor/Makefile
 	sc_hypervisor/src/Makefile
 	sc_hypervisor/examples/Makefile
@@ -3689,7 +3697,7 @@ AC_MSG_NOTICE([
 	       MPI test suite:                                $running_mpi_check
 	       Master-Slave MPI enabled:                      $use_mpi_master_slave
 	       FFT Support:                                   $fft_support
-	       Resource Management enable:                    $starpurm_support
+	       Resource Management enabled:                   $starpurm_support
 	       OpenMP runtime support enabled:                $enable_openmp
 	       Cluster support enabled:                       $enable_cluster
 	       SOCL enabled:                                  $build_socl

+ 4 - 4
doc/doxygen/Makefile.am

@@ -146,6 +146,10 @@ images = 	\
 	chapters/images/temanejo.png
 
 if STARPU_BUILD_DOC
+EXTRA_DIST += \
+	      $(top_srcdir)/doc/doxygen/chapters/version.sty \
+	      $(top_srcdir)/doc/doxygen/chapters/version.html
+
 starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 
@@ -302,8 +306,4 @@ endif
 EXTRA_DIST += doxygen.cfg refman.tex \
 	      $(chapters) $(images)
 
-# Rule to update documentation on web server. Should only be called from benchmarks dalton directory
-PUBLISHDIR	?= /home/benchmarks/softs/starpu/starpu-scripts/mirror/files/doc
-update-web: $(DOX_PDF)
-	cp -pr starpu.pdf html $(PUBLISHDIR)
 

+ 64 - 5
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -84,7 +84,11 @@ Or you can simply point the <c>PKG_CONFIG_PATH</c> to
 When FxT is enabled, a trace is generated when StarPU is terminated by calling
 starpu_shutdown(). The trace is a binary file whose name has the form
 <c>prof_file_XXX_YYY</c> where <c>XXX</c> is the user name, and
-<c>YYY</c> is the pid of the process that used StarPU. This file is saved in the
+<c>YYY</c> is the MPI id of the process that used StarPU (or 0 when running a sequential program).
+One can change
+the name of the file by setting the environment variable \ref
+STARPU_FXT_SUFFIX; its contents will be used instead of <c>prof_file_XXX</c>.
+This file is saved in the
 <c>/tmp/</c> directory by default, or in the directory specified by
 the environment variable \ref STARPU_FXT_PREFIX.
 
@@ -104,8 +108,10 @@ $ starpu_fxt_tool -i /tmp/prof_file_something
 \endverbatim
 
 Or alternatively, setting the environment variable \ref STARPU_GENERATE_TRACE
-to <c>1</c> before application execution will make StarPU do it automatically at
-application shutdown.
+to <c>1</c> before application execution will make StarPU
+automatically generate all traces at application shutdown. Note that
+if the environment variable \ref STARPU_FXT_PREFIX is set, files will
+be generated in the given directory.
 
 One can also set the environment variable \ref
 STARPU_GENERATE_TRACE_OPTIONS to specify options, see
@@ -394,7 +400,7 @@ starpu_perfmodel_load_symbol(). The source code of the tool
 
 An XML output can also be printed by using the <c>-x</c> option:
 \verbatim
-tools/starpu_perfmodel_display -x -s non_linear_memset_regression_based 
+$ tools/starpu_perfmodel_display -x -s non_linear_memset_regression_based 
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE StarPUPerfmodel SYSTEM "starpu-perfmodel.dtd">
 <!-- symbol non_linear_memset_regression_based -->
@@ -419,6 +425,12 @@ The tool <c>starpu_perfmodel_plot</c> can be used to draw performance
 models. It writes a <c>.gp</c> file in the current directory, to be
 run with the tool <c>gnuplot</c>, which shows the corresponding curve.
 
+\verbatim
+$ tools/starpu_perfmodel_plot -s non_linear_memset_regression_based 
+$ gnuplot starpu_non_linear_memset_regression_based.gp
+$ gv starpu_non_linear_memset_regression_based.eps
+\endverbatim
+
 \image html starpu_non_linear_memset_regression_based.png
 \image latex starpu_non_linear_memset_regression_based.eps "" width=\textwidth
 
@@ -476,6 +488,53 @@ histogram of the codelet execution time distribution.
 \image html distrib_data_histo.png
 \image latex distrib_data_histo.eps "" width=\textwidth
 
+\section EnergyOfCodelets Energy Of Codelets
+
+A performance model of the energy of codelets can also be recorded thanks to
+the starpu_codelet::energy_model field of the starpu_codelet structure. StarPU usually cannot
+record this automatically since the energy measurement probes are usually not
+fine-grain enough.  It is however possible to measure it by writing a program
+that submits batches of tasks, lets StarPU measure the energy requirement of
+the batch, and computes an average, see \ref MeasuringEnergyandPower .
+
+The energy performance model can then be displayed in Joules with
+<c>starpu_perfmodel_display</c> just like the time performance model.  The
+<c>starpu_perfmodel_plot</c> needs an extra <c>-e</c> option to display the
+proper unit in the graph:
+
+\verbatim
+$ tools/starpu_perfmodel_plot -e -s non_linear_memset_regression_based_energy
+$ gnuplot starpu_non_linear_memset_regression_based_energy.gp
+$ gv starpu_non_linear_memset_regression_based_energy.eps
+\endverbatim
+
+\image html starpu_non_linear_memset_regression_based_energy.png
+\image latex starpu_non_linear_memset_regression_based_energy.eps "" width=\textwidth
+
+The <c>-f</c> option can also be used to display the performance in terms of GFlop/s/W, i.e. the efficiency:
+
+\verbatim
+$ tools/starpu_perfmodel_plot -f -e -s non_linear_memset_regression_based_energy
+$ gnuplot starpu_non_linear_memset_regression_based_energy.gp
+$ gv starpu_non_linear_memset_regression_based_energy.eps
+\endverbatim
+
+\image html starpu_non_linear_memset_regression_based_energy_flops.png
+\image latex starpu_non_linear_memset_regression_based_energy_flops.eps "" width=\textwidth
+
+We clearly see here that it is much more energy-efficient to stay in the L3 cache.
+
+One can combine the two time and energy performance models to draw Watts:
+
+\verbatim
+$ tools/starpu_perfmodel_plot -se non_linear_memset_regression_based non_linear_memset_regression_based_energy
+$ gnuplot starpu_power_non_linear_memset_regression_based.gp
+$ gv starpu_power_non_linear_memset_regression_based.eps
+\endverbatim
+
+\image html starpu_power_non_linear_memset_regression_based.png
+\image latex starpu_power_non_linear_memset_regression_based.eps "" width=\textwidth
+
 \section DataTrace Data trace and tasks length
 
 It is possible to get statistics about tasks length and data size by using :
@@ -545,7 +604,7 @@ S: Start time
 Here's an example on how to use it:
 
 \verbatim
-$ python starpu_trace_state_stats.py trace.rec | column -t -s ","
+$ starpu_trace_state_stats.py trace.rec | column -t -s ","
 "Name"		"Count" "Type"	"Duration"
 "Callback"       220	Runtime	0.075978
 "chol_model_11"  10	Task	565.176

+ 27 - 3
doc/doxygen/chapters/501_environment_variables.doxy

@@ -69,7 +69,8 @@ create as many CUDA workers as there are CUDA devices.
 \anchor STARPU_NWORKER_PER_CUDA
 \addindex __env__STARPU_NWORKER_PER_CUDA
 Specify the number of workers per CUDA device, and thus the number of kernels
-which will be concurrently running on the devices. The default value is 1.
+which will be concurrently running on the devices, i.e. the number of CUDA
+streams. The default value is 1.
 </dd>
 
 <dt>STARPU_CUDA_THREAD_PER_WORKER</dt>
@@ -924,6 +925,22 @@ has been configured with the option \ref enable-verbose "--enable-verbose". Also
 disable the display of StarPU information and warning messages.
 </dd>
 
+<dt>STARPU_MPI_DEBUG_LEVEL_MIN</dt>
+<dd>
+\anchor STARPU_MPI_DEBUG_LEVEL_MIN
+\addindex __env__STARPU_MPI_DEBUG_LEVEL_MIN
+Set the minimum level of debug when StarPU
+has been configured with the option \ref enable-mpi-verbose "--enable-mpi-verbose".
+</dd>
+
+<dt>STARPU_MPI_DEBUG_LEVEL_MAX</dt>
+<dd>
+\anchor STARPU_MPI_DEBUG_LEVEL_MAX
+\addindex __env__STARPU_MPI_DEBUG_LEVEL_MAX
+Set the maximum level of debug when StarPU
+has been configured with the option \ref enable-mpi-verbose "--enable-mpi-verbose".
+</dd>
+
 <dt>STARPU_LOGFILENAME</dt>
 <dd>
 \anchor STARPU_LOGFILENAME
@@ -935,14 +952,21 @@ Specify in which file the debugging output should be saved to.
 <dd>
 \anchor STARPU_FXT_PREFIX
 \addindex __env__STARPU_FXT_PREFIX
-Specify in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
+Specify in which directory to save the generated trace if FxT is enabled.
+</dd>
+
+<dt>STARPU_FXT_SUFFIX</dt>
+<dd>
+\anchor STARPU_FXT_SUFFIX
+\addindex __env__STARPU_FXT_SUFFIX
+Specify in which file to save the generated trace if FxT is enabled.
 </dd>
 
 <dt>STARPU_FXT_TRACE</dt>
 <dd>
 \anchor STARPU_FXT_TRACE
 \addindex __env__STARPU_FXT_TRACE
-Specify whether to generate (1) or not (0) the FxT trace in /tmp/prof_file_XXX_YYY . The default is 1 (generate it)
+Specify whether to generate (1) or not (0) the FxT trace in /tmp/prof_file_XXX_YYY (the directory and file name can be changed with \ref STARPU_FXT_PREFIX and \ref STARPU_FXT_SUFFIX). The default is 1 (generate it)
 </dd>
 
 <dt>STARPU_LIMIT_CUDA_devid_MEM</dt>

File diff suppressed because it is too large
+ 1754 - 0
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.eps


BIN
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.pdf


BIN
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.png


File diff suppressed because it is too large
+ 1388 - 0
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.eps


BIN
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.pdf


BIN
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.png


File diff suppressed because it is too large
+ 1416 - 0
doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.eps


BIN
doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.pdf


BIN
doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.png


+ 3 - 3
doc/doxygen/doxygen.cfg

@@ -365,7 +365,7 @@ TYPEDEF_HIDES_STRUCT   = NO
 # 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
 # corresponding to a cache size of 2^16 = 65536 symbols.
 
-SYMBOL_CACHE_SIZE      = 0
+#SYMBOL_CACHE_SIZE      = 0
 
 # Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
 # set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
@@ -1502,13 +1502,13 @@ XML_OUTPUT             = xml
 # which can be used by a validating XML parser to check the
 # syntax of the XML files.
 
-XML_SCHEMA             =
+#XML_SCHEMA             =
 
 # The XML_DTD tag can be used to specify an XML DTD,
 # which can be used by a validating XML parser to check the
 # syntax of the XML files.
 
-XML_DTD                =
+#XML_DTD                =
 
 # If the XML_PROGRAMLISTING tag is set to YES Doxygen will
 # dump the program listings (including syntax highlighting

+ 4 - 5
doc/doxygen_dev/Makefile.am

@@ -67,6 +67,10 @@ chapters =	\
 images =
 
 if STARPU_BUILD_DOC
+EXTRA_DIST += \
+	      $(top_srcdir)/doc/doxygen_dev/chapters/version.sty \
+	      $(top_srcdir)/doc/doxygen_dev/chapters/version.html
+
 config.h: $(top_srcdir)/src/common/config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 	@$(SED) -i '1s/^/\/\*\* \@file \*\/\n/' $@
@@ -245,8 +249,3 @@ endif
 EXTRA_DIST += doxygen.cfg refman.tex \
 	      $(chapters) $(images)
 
-# Rule to update documentation on web server. Should only be called from benchmarks dalton directory
-PUBLISHDIR	?= /home/benchmarks/softs/starpu/starpu-scripts/mirror/files/doc
-update-web: $(DOX_PDF)
-	cp -pr starpu_dev.pdf html_dev $(PUBLISHDIR)
-

+ 7 - 7
doc/doxygen_dev/chapters/010_core.doxy

@@ -211,7 +211,7 @@ the application code on a thread launched by the application, or automatically
 by StarPU on a device-dependent CPU thread launched by StarPU. Whether a
 worker's operation cycle is managed automatically or
 not is controlled per session by the field \c not_launched_drivers of the \c
-starpu_conf struct, and is decided in \ref _starpu_launch_drivers() function.
+starpu_conf struct, and is decided in \c _starpu_launch_drivers function.
 
 When managed automatically, cycles of operations for a worker are handled by the corresponding
 driver specific <code>_starpu_<DRV>_worker()</code> function, where \c DRV is a driver name such as
@@ -220,7 +220,7 @@ Otherwise, the application must supply a thread which will repeatedly call \ref
 starpu_driver_run_once() for the corresponding worker.
 
 In both cases, control is then transferred to 
-\ref _starpu_cpu_driver_run_once() (or the corresponding driver specific func).
+\c _starpu_cpu_driver_run_once (or the corresponding driver specific func).
 The cycle of operations typically includes, at least, the following operations:
 
 - <b>task scheduling</b>
@@ -270,7 +270,7 @@ driving) progress, with a call to \ref __starpu_datawizard_progress(),
 
 Once the worker has a pending task assigned and the input data for that task are
 available in the memory node reachable by the worker's computing unit, the
-worker calls \ref _starpu_cpu_driver_execute_task() (or the corresponding driver
+worker calls \c _starpu_cpu_driver_execute_task (or the corresponding driver
 specific function) to proceed to the execution of the task.
 
 
@@ -312,12 +312,12 @@ writing.
 When the set of workers assigned to a scheduling context is about to be
 modified, all the workers in the union between the workers belonging to the
 scheduling context before the change and the workers expected to belong to the
-scheduling context after the change must be notified using the \ref
-notify_workers_about_changing_ctx_pending() function prior to the update. After
+scheduling context after the change must be notified using the
+\c notify_workers_about_changing_ctx_pending function prior to the update. After
 the update, all the workers in that same union must be notified for the update
-completion with a call to \ref notify_workers_about_changing_ctx_done().
+completion with a call to \c notify_workers_about_changing_ctx_done.
 
-The function \ref notify_workers_about_changing_ctx_pending() places every
+The function \c notify_workers_about_changing_ctx_pending places every
 worker passed in argument in a state compatible with changing the scheduling
 context assignment of that worker, possibly blocking until that worker leaves
 incompatible states such as a pending scheduling operation. If the caller of

+ 65 - 0
doc/doxygen_dev/dev/starpu_check_missing.sh

@@ -0,0 +1,65 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+stcolor=$(tput sgr0)
+redcolor=$(tput setaf 1)
+greencolor=$(tput setaf 2)
+
+dirname=$(realpath $(dirname $0))
+
+ok()
+{
+    type=$1
+    name=$2
+    echo "$type ${greencolor}${name}${stcolor} is in doxygen-config.cfg.in"
+}
+
+ko()
+{
+    type=$1
+    name=$2
+    #echo "$type ${redcolor}${name}${stcolor} is missing from doxygen-config.cfg.in"
+    echo $name
+}
+
+for d in src mpi/src starpurm/src
+do
+    cd $dirname/../../../$d
+    for f in $(find -name "*.h")
+    do
+	ff=$(echo $f | cut -b3-)
+	x=$(grep -c $ff $dirname/../doxygen-config.cfg.in)
+	if test "$x" == "0"
+	then
+	    ko file $d/$ff
+	#else
+	#    ok file $d/$ff
+	fi
+    done
+done
+
+cd $dirname/../../../build/doc/doxygen_dev/latex
+for f in $(find -name "*8h.tex")
+do
+    ff=$(basename $(echo $f | cut -b3-) ".tex")
+    x=$(grep -c $ff refman.tex)
+    if test "$x" == "0"
+    then
+	ko file $ff
+    fi
+done
+

+ 29 - 1
doc/doxygen_dev/doxygen-config.cfg.in

@@ -16,6 +16,9 @@
 #
 INPUT                  = @top_srcdir@/doc/doxygen_dev/chapters         \
                          @top_builddir@/doc/doxygen_dev/config.h \
+			 @top_srcdir@/include/starpu_driver.h \
+			 @top_srcdir@/include/starpu_worker.h \
+			 @top_srcdir@/include/starpu_config.h \
 			 @top_srcdir@/src/datawizard/data_request.h \
 			 @top_srcdir@/src/datawizard/coherency.h \
 			 @top_srcdir@/src/datawizard/sort_data_handles.h \
@@ -99,7 +102,32 @@ INPUT                  = @top_srcdir@/doc/doxygen_dev/chapters         \
 			 @top_srcdir@/src/core/errorcheck.h \
 			 @top_srcdir@/src/core/progress_hook.h \
 			 @top_srcdir@/src/core/drivers.h \
-			 @top_srcdir@/src/core/workers.h
+			 @top_srcdir@/src/core/workers.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_init.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_datatype.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_task_insert.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_select_node.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_fxt.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_cache.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_mpi_backend.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_driver.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_early_data.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_comm.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_tag.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_mpi.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_early_request.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_sync_data.h \
+			 @top_srcdir@/mpi/src/load_balancer/policy/load_data_interface.h \
+			 @top_srcdir@/mpi/src/load_balancer/policy/load_balancer_policy.h \
+			 @top_srcdir@/mpi/src/load_balancer/policy/data_movements_interface.h \
+			 @top_srcdir@/mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.h \
+			 @top_srcdir@/mpi/src/nmad/starpu_mpi_nmad_backend.h \
+			 @top_srcdir@/mpi/src/nmad/starpu_mpi_nmad.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_stats.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_private.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_cache_stats.h \
+			 @top_srcdir@/starpurm/src/starpurm_private.h
+
 
 EXAMPLE_PATH           = @top_srcdir@/doc/doxygen_dev \
 		       	 @top_srcdir@/doc/doxygen/chapters

+ 5 - 3
doc/doxygen_dev/doxygen.cfg

@@ -365,7 +365,7 @@ TYPEDEF_HIDES_STRUCT   = NO
 # 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
 # corresponding to a cache size of 2^16 = 65536 symbols.
 
-SYMBOL_CACHE_SIZE      = 0
+#SYMBOL_CACHE_SIZE      = 0
 
 # Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
 # set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
@@ -1502,13 +1502,13 @@ XML_OUTPUT             = xml
 # which can be used by a validating XML parser to check the
 # syntax of the XML files.
 
-XML_SCHEMA             =
+#XML_SCHEMA             =
 
 # The XML_DTD tag can be used to specify an XML DTD,
 # which can be used by a validating XML parser to check the
 # syntax of the XML files.
 
-XML_DTD                =
+#XML_DTD                =
 
 # If the XML_PROGRAMLISTING tag is set to YES Doxygen will
 # dump the program listings (including syntax highlighting
@@ -1621,6 +1621,8 @@ PREDEFINED             = STARPU_USE_OPENCL=1 \
 			 STARPU_USE_SC_HYPERVISOR=1 \
 			 STARPU_SIMGRID=1 \
 			 STARPU_OPENMP=1 \
+			 STARPU_USE_MPI_MPI=1 \
+			 STARPU_USE_MPI_NMAD=1 \
                          __GCC__
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then

+ 30 - 1
doc/doxygen_dev/refman.tex

@@ -76,7 +76,7 @@ Documentation License”.
 \chapter{File Index}
 \input{files}
 
-\chapter{File Documentation}
+\chapter{StarPU File Documentation}
 \input{barrier_8h}
 \input{barrier__counter_8h}
 \input{bound_8h}
@@ -158,6 +158,35 @@ Documentation License”.
 \input{timing_8h}
 \input{topology_8h}
 \input{utils_8h}
+\input{uthash_8h}
 \input{write__back_8h}
 
+\chapter{StarPU MPI File Documentation}
+\input{starpu__mpi__cache_8h}
+\input{starpu__mpi__driver_8h}
+\input{starpu__mpi__init_8h}
+\input{starpu__mpi__nmad__backend_8h}
+\input{starpu__mpi__stats_8h}
+\input{starpu__mpi__cache__stats_8h}
+\input{starpu__mpi__early__data_8h}
+\input{starpu__mpi__mpi_8h}
+\input{starpu__mpi__nmad__unknown__datatype_8h}
+\input{starpu__mpi__sync__data_8h}
+\input{starpu__mpi__comm_8h}
+\input{starpu__mpi__early__request_8h}
+\input{starpu__mpi__mpi__backend_8h}
+\input{starpu__mpi__private_8h}
+\input{starpu__mpi__tag_8h}
+\input{starpu__mpi__datatype_8h}
+\input{starpu__mpi__fxt_8h}
+\input{starpu__mpi__nmad_8h}
+\input{starpu__mpi__select__node_8h}
+\input{starpu__mpi__task__insert_8h}
+\input{load__balancer__policy_8h}
+\input{load__data__interface_8h}
+\input{data__movements__interface_8h}
+
+\chapter{StarPU Resource Manager File Documentation}
+\input{starpurm__private_8h}
+
 \end{document}

+ 6 - 5
examples/Makefile.am

@@ -20,9 +20,10 @@ include $(top_srcdir)/starpu.mk
 
 AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CXXFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@ $(FXT_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) $(FXT_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 SUBDIRS = stencil
 
@@ -75,7 +76,7 @@ EXTRA_DIST = 					\
 	lu/lu.sh
 
 
-CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps *.dot *.pl *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90 native_fortran/fstarpu_mod.f90
+CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps */*.mps */*/*.mps *.dot */*.dot */*/*.dot *.pl */*.pl */*/*.pl *.png *.output tasks.rec perfs.rec */perfs.rec */*/perfs.rec perfs2.rec fortran90/starpu_mod.f90 native_fortran/fstarpu_mod.f90
 
 if STARPU_USE_CUDA
 
@@ -165,7 +166,7 @@ if !STARPU_HAVE_WINDOWS
 ## test loader program
 if !STARPU_CROSS_COMPILING
 LOADER			=	loader
-loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	=	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 loader_SOURCES		=	../tests/loader.c
 noinst_PROGRAMS		+=	loader
@@ -1012,7 +1013,7 @@ endif
 examplebin_PROGRAMS +=				\
 	mandelbrot/mandelbrot
 
-mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS)
 if STARPU_HAVE_X11
 mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
 mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) -lX11 $(X_EXTRA_LIBS)

+ 7 - 3
examples/axpy/axpy_opencl.c

@@ -31,7 +31,9 @@ void axpy_opencl(void *buffers[], void *_args)
 
 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
 	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+	unsigned x_offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
 	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
+	unsigned y_offset = STARPU_VECTOR_GET_OFFSET(buffers[1]);
 
 	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);
@@ -41,9 +43,11 @@ void axpy_opencl(void *buffers[], void *_args)
 		STARPU_OPENCL_REPORT_ERROR(err);
 
 	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
-	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
-	err|= clSetKernelArg(kernel, 2, sizeof(n), &n);
-	err|= clSetKernelArg(kernel, 3, sizeof(*alpha), alpha);
+	err|= clSetKernelArg(kernel, 1, sizeof(x_offset), &x_offset);
+	err|= clSetKernelArg(kernel, 2, sizeof(y), &y);
+	err|= clSetKernelArg(kernel, 3, sizeof(y_offset), &y_offset);
+	err|= clSetKernelArg(kernel, 4, sizeof(n), &n);
+	err|= clSetKernelArg(kernel, 5, sizeof(*alpha), alpha);
 	if (err)
 		STARPU_OPENCL_REPORT_ERROR(err);
 

+ 4 - 0
examples/axpy/axpy_opencl_kernel.cl

@@ -19,11 +19,15 @@
 #include "axpy.h"
 
 __kernel void _axpy_opencl(__global TYPE *x,		/* input vector buffer */
+			   unsigned x_offset,		/* offset of x inside its buffer, applied in bytes */
 			   __global TYPE *y,		/* input/output vector buffer */
+			   unsigned y_offset,		/* offset of y inside its buffer, applied in bytes */
 			   unsigned nx,			/* number of elements to process */
 			   TYPE alpha)			/* scaling factor applied to x */
 {
         const int i = get_global_id(0);	/* each work-item handles the element at its global id */
+        x = (__global TYPE *)((__global char *) x + x_offset);	/* byte arithmetic, then explicit cast back to the element type */
+        y = (__global TYPE *)((__global char *) y + y_offset);
         if (i < nx)				/* work-items past the end of the vector do nothing */
                 y[i] = alpha * x[i] + y[i];	/* y <- alpha * x + y */
 }

+ 8 - 5
examples/cholesky/cholesky_implicit.c

@@ -206,6 +206,14 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 {
 	float *mat = NULL;
 
+	/*
+	 * create a simple symmetric positive definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 *
+	 * and make it better conditioned by adding one on the diagonal.
+	 */
+
 #ifndef STARPU_SIMGRID
 	unsigned m,n;
 	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
@@ -324,11 +332,6 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
 int main(int argc, char **argv)
 {
-	/* create a simple definite positive symetric matrix example
-	 *
-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
-	 * */
-
 #ifdef STARPU_HAVE_MAGMA
 	magma_init();
 #endif

+ 8 - 4
examples/lu/lu.sh

@@ -19,6 +19,8 @@
 set -e
 
 PREFIX=$(dirname $0)
+rm -rf $PREFIX/lu.traces
+mkdir -p $PREFIX/lu.traces
 
 if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/lu_implicit_example_float
@@ -26,11 +28,13 @@ if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_implicit_example_float" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_implicit_example_float
 fi
 
+export STARPU_FXT_PREFIX=$PREFIX/lu.traces
+
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -piv
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -no-stride
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -bound
-$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bounddeps
-$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
+$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bounddeps -directory $STARPU_FXT_PREFIX
+$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio -directory $STARPU_FXT_PREFIX
 
 if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/lu_example_float
@@ -41,5 +45,5 @@ fi
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -piv
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -no-stride
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -bound
-$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bounddeps
-$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
+$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bounddeps -directory $PREFIX/lu.traces
+$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio -directory $STARPU_FXT_PREFIX

+ 19 - 7
examples/lu/lu_example.c

@@ -40,6 +40,7 @@ static unsigned no_prio=0;
 unsigned bound = 0;
 unsigned bounddeps = 0;
 unsigned boundprio = 0;
+char *directory =  NULL;
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
@@ -102,9 +103,13 @@ static void parse_args(int argc, char **argv)
 			bounddeps = 1;
 			boundprio = 1;
 		}
+		else if (strcmp(argv[i], "-directory") == 0)
+		{
+			directory = strdup(argv[++i]);
+		}
 		else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
-			fprintf(stderr,"usage: lu [-size n] [-nblocks b] [-piv] [-no-stride] [-profile] [-bound] [-bounddeps] [-bounddepsprio]\n");
+			fprintf(stderr,"usage: lu [-size n] [-nblocks b] [-piv] [-no-stride] [-profile] [-bound] [-bounddeps] [-bounddepsprio] [-directory d]\n");
 			fprintf(stderr,"Default is size %lu and nblocks %u\n", size, nblocks);
 			exit(0);
 		}
@@ -420,17 +425,23 @@ int main(int argc, char **argv)
 	{
 		if (bounddeps)
 		{
-			FILE *f = fopen("lu.pl", "w");
+			if (!directory)
+				directory = strdup(".");
+			char filename[256];
+			snprintf(filename, sizeof(filename), "%s/%s", directory, "lu.pl");
+			FILE *f = fopen(filename, "w");
 			starpu_bound_print_lp(f);
-			FPRINTF(stderr,"system printed to lu.pl\n");
+			FPRINTF(stderr,"system printed to %s\n", filename);
 			fclose(f);
-			f = fopen("lu.mps", "w");
+			snprintf(filename, sizeof(filename), "%s/%s", directory, "lu.mps");
+			f = fopen(filename, "w");
 			starpu_bound_print_mps(f);
-			FPRINTF(stderr,"system printed to lu.mps\n");
+			FPRINTF(stderr,"system printed to %s\n", filename);
 			fclose(f);
-			f = fopen("lu.dot", "w");
+			snprintf(filename, sizeof(filename), "%s/%s", directory, "lu.dot");
+			f = fopen(filename, "w");
 			starpu_bound_print_dot(f);
-			FPRINTF(stderr,"system printed to lu.mps\n");
+			FPRINTF(stderr,"system printed to %s\n", filename);
 			fclose(f);
 		}
 	}
@@ -458,6 +469,7 @@ int main(int argc, char **argv)
 	starpu_cublas_shutdown();
 
 	starpu_shutdown();
+	free(directory);
 
 	if (ret == -ENODEV) return 77; else return 0;
 }

+ 13 - 1
examples/mlr/mlr.c

@@ -44,6 +44,12 @@
 #include <stdint.h>
 #include <starpu.h>
 
+#ifdef STARPU_QUICK_CHECK
+#define NTASKS 10
+#else
+#define NTASKS 1000
+#endif
+
 static long sum;
 
 /* Performance function of the task, which is in this case very simple, as the parameter values just need to be written in the array "parameters" */
@@ -185,7 +191,7 @@ int main(void)
 		vector_mn[1] = n;
 		starpu_data_release(vector_mn_handle);
 
-		for (j = 0; j < 42; j++)
+		for (j = 0; j < NTASKS; j++)
 		{
 			starpu_insert_task(&cl_init,
 					   STARPU_R, vector_mn_handle,
@@ -202,5 +208,11 @@ int main(void)
 	free(vector_mn);
 	starpu_shutdown();
 
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	starpu_perfmodel_dump_xml(stdout, &cl_model_final);
+	starpu_shutdown();
+
 	return 0;
 }

+ 31 - 27
examples/mult/sgemm.sh

@@ -25,6 +25,8 @@
 set -e
 
 PREFIX=$(dirname $0)
+rm -rf $PREFIX/sgemm.traces
+mkdir -p $PREFIX/sgemm.traces
 
 if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/sgemm
@@ -32,46 +34,48 @@ if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/sgemm" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/sgemm
 fi
 
-STARPU_SCHED=dmdas STARPU_FXT_PREFIX=$PREFIX/ $PREFIX/sgemm -check
-[ ! -x $PREFIX/../../tools/starpu_perfmodel_display ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_display -s starpu_sgemm_gemm
-[ ! -x $PREFIX/../../tools/starpu_perfmodel_display ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_display -x -s starpu_sgemm_gemm
-[ ! -x $PREFIX/../../tools/starpu_perfmodel_recdump ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_recdump -o perfs.rec
-[ -f perfs.rec ]
+export STARPU_FXT_PREFIX=$PREFIX/sgemm.traces
+
+STARPU_SCHED=dmdas $PREFIX/sgemm -check
 if [ -x $PREFIX/../../tools/starpu_fxt_tool ];
 then
-	$STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_plot -s starpu_sgemm_gemm -i $PREFIX/prof_file_${USER}_0
-	[ -f starpu_starpu_sgemm_gemm.gp -a -f starpu_starpu_sgemm_gemm.data -a -f starpu_starpu_sgemm_gemm.data ]
+	$STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_plot -o $STARPU_FXT_PREFIX -s starpu_sgemm_gemm -i $STARPU_FXT_PREFIX/prof_file_${USER}_0
+	[ -f $STARPU_FXT_PREFIX/starpu_starpu_sgemm_gemm.gp -a -f $STARPU_FXT_PREFIX/starpu_starpu_sgemm_gemm.data -a -f $STARPU_FXT_PREFIX/starpu_starpu_sgemm_gemm.data ]
 
 	# Generate paje, dag, data, etc.
-	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_tool -memory-states -label-deps -i $PREFIX/prof_file_${USER}_0
+	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_tool -d $STARPU_FXT_PREFIX -memory-states -label-deps -i $STARPU_FXT_PREFIX/prof_file_${USER}_0
 
-	$PREFIX/../../tools/starpu_paje_sort paje.trace
-	! type pj_dump || pj_dump -e 0 < paje.trace
+	$PREFIX/../../tools/starpu_paje_sort $STARPU_FXT_PREFIX/paje.trace
+	! type pj_dump || pj_dump -e 0 < $STARPU_FXT_PREFIX/paje.trace
 
-	$PREFIX/../../tools/starpu_codelet_profile distrib.data starpu_sgemm_gemm
-	[ -f distrib.data.gp -a \( -f distrib.data.0 -o -f distrib.data.1 -o -f distrib.data.2 -o -f distrib.data.3 -o -f distrib.data.4 \) ]
+	$PREFIX/../../tools/starpu_codelet_profile $STARPU_FXT_PREFIX/distrib.data starpu_sgemm_gemm
+	[ -f $STARPU_FXT_PREFIX/distrib.data.gp -a \( -f $STARPU_FXT_PREFIX/distrib.data.0 -o -f $STARPU_FXT_PREFIX/distrib.data.1 -o -f $STARPU_FXT_PREFIX/distrib.data.2 -o -f $STARPU_FXT_PREFIX/distrib.data.3 -o -f $STARPU_FXT_PREFIX/distrib.data.4 -o -f $STARPU_FXT_PREFIX/distrib.data.5 -o -f $STARPU_FXT_PREFIX/distrib.data.6 \) ]
 
-	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_data_trace $PREFIX/prof_file_${USER}_0 starpu_sgemm_gemm
-	[ -f data_trace.gp ]
+	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_data_trace -d $STARPU_FXT_PREFIX $STARPU_FXT_PREFIX/prof_file_${USER}_0 starpu_sgemm_gemm
+	[ -f $STARPU_FXT_PREFIX/data_trace.gp ]
 
-	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_stats -i $PREFIX/prof_file_${USER}_0
-	$STARPU_LAUNCH $PREFIX/../../tools/starpu_tasks_rec_complete tasks.rec tasks2.rec
-	python $PREFIX/../../tools/starpu_trace_state_stats.py trace.rec
-	$PREFIX/../../tools/starpu_workers_activity activity.data
-	[ -f activity.eps ]
+	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_stats -i $STARPU_FXT_PREFIX/prof_file_${USER}_0
+	$STARPU_LAUNCH $PREFIX/../../tools/starpu_tasks_rec_complete $STARPU_FXT_PREFIX/tasks.rec $STARPU_FXT_PREFIX/tasks2.rec
+	python3 $PREFIX/../../tools/starpu_trace_state_stats.py $STARPU_FXT_PREFIX/trace.rec
+	$PREFIX/../../tools/starpu_workers_activity -d $STARPU_FXT_PREFIX $STARPU_FXT_PREFIX/activity.data
+	[ -f $STARPU_FXT_PREFIX/activity.eps ]
 
 	# needs some R packages
-	$PREFIX/../../tools/starpu_paje_draw_histogram paje.trace || true
-	$PREFIX/../../tools/starpu_paje_state_stats paje.trace || true
-	$PREFIX/../../tools/starpu_paje_summary paje.trace || true
-	$PREFIX/../../tools/starpu_codelet_histo_profile distrib.data || true
-	[ -f distrib.data.starpu_sgemm_gemm.0.492beed5.33177600.pdf ] || true
+	$PREFIX/../../tools/starpu_paje_draw_histogram $STARPU_FXT_PREFIX/paje.trace || true
+	$PREFIX/../../tools/starpu_paje_state_stats $STARPU_FXT_PREFIX/paje.trace || true
+	$PREFIX/../../tools/starpu_paje_summary $STARPU_FXT_PREFIX/paje.trace || true
+	$PREFIX/../../tools/starpu_codelet_histo_profile $STARPU_FXT_PREFIX/distrib.data || true
+	[ -f $STARPU_FXT_PREFIX/distrib.data.starpu_sgemm_gemm.0.492beed5.33177600.pdf ] || true
 
 	if [ -x $PREFIX/../../tools/starpu_replay ]; then
-		$STARPU_LAUNCH $PREFIX/../../tools/starpu_replay tasks.rec
+		$STARPU_LAUNCH $PREFIX/../../tools/starpu_replay $STARPU_FXT_PREFIX/tasks.rec
 	fi
 
-	[ ! -x $PREFIX/../../tools/starpu_perfmodel_recdump ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_recdump tasks.rec -o perfs2.rec
-	[ -f perfs2.rec ]
+	[ ! -x $PREFIX/../../tools/starpu_perfmodel_recdump ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_recdump $STARPU_FXT_PREFIX/tasks.rec -o $STARPU_FXT_PREFIX/perfs2.rec
+	[ -f $STARPU_FXT_PREFIX/perfs2.rec ]
 fi
 
+[ ! -x $PREFIX/../../tools/starpu_perfmodel_display ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_display -s starpu_sgemm_gemm
+[ ! -x $PREFIX/../../tools/starpu_perfmodel_display ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_display -x -s starpu_sgemm_gemm
+[ ! -x $PREFIX/../../tools/starpu_perfmodel_recdump ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_recdump -o $STARPU_FXT_PREFIX/perfs.rec
+[ -f $STARPU_FXT_PREFIX/perfs.rec ]

+ 16 - 0
examples/native_fortran/nf_vector.f90

@@ -23,6 +23,7 @@ program nf_vector
         integer, dimension(:), allocatable, target :: vb
         integer :: i
 
+        type(c_ptr) :: perfmodel_vec   ! a pointer for the perfmodel structure
         type(c_ptr) :: cl_vec   ! a pointer for the codelet structure
         type(c_ptr) :: dh_va    ! a pointer for the 'va' vector data handle
         type(c_ptr) :: dh_vb    ! a pointer for the 'vb' vector data handle
@@ -48,12 +49,24 @@ program nf_vector
                 stop 77
         end if
 
+        ! allocate an empty perfmodel structure
+        perfmodel_vec = fstarpu_perfmodel_allocate()
+
+        ! set the perfmodel symbol
+        call fstarpu_perfmodel_set_symbol(perfmodel_vec, C_CHAR_"my_vec_sym"//C_NULL_CHAR)
+
+        ! set the perfmodel type
+        call fstarpu_perfmodel_set_type(perfmodel_vec, FSTARPU_HISTORY_BASED)
+
         ! allocate an empty codelet structure
         cl_vec = fstarpu_codelet_allocate()
 
         ! set the codelet name
         call fstarpu_codelet_set_name(cl_vec, C_CHAR_"my_vec_codelet"//C_NULL_CHAR)
 
+        ! set the codelet perfmodel
+        call fstarpu_codelet_set_model(cl_vec, perfmodel_vec)
+
         ! add a CPU implementation function to the codelet
         call fstarpu_codelet_add_cpu_func(cl_vec, C_FUNLOC(cl_cpu_func_vec))
 
@@ -98,6 +111,9 @@ program nf_vector
         ! shut StarPU down
         call fstarpu_shutdown()
 
+        ! free perfmodel structure (must be called after fstarpu_shutdown)
+        call fstarpu_perfmodel_free(perfmodel_vec)
+
         deallocate(vb)
         deallocate(va)
 

+ 29 - 5
examples/scheduler/schedulers.sh

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2012-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
@@ -36,9 +36,33 @@ else
 	SCHEDULERS=`$basedir/../../tools/starpu_sched_display | grep -v heteroprio`
 fi
 
-for sched in $SCHEDULERS
-do
+run()
+{
+    sched=$1
     echo "cholesky.$sched"
-    STARPU_SCHED=$sched $STARPU_LAUNCH $basedir/../cholesky/cholesky_tag -size $((960*3)) -nblocks 3
+    STARPU_SCHED=$sched $STARPU_LAUNCH $basedir/../cholesky/cholesky_tag -size $((320*3)) -nblocks 3
     check_success $?
-done
+}
+
+case "$MAKEFLAGS" in
+    *\ -j1[0-9]*\ *|*\ -j[2-9]*\ *)	# MAKEFLAGS holds a space-delimited -jN option with N >= 2
+	for sched in $SCHEDULERS
+	do
+		run $sched &	# launch every scheduler run as a background job
+	done
+	while true
+	do
+		wait -n	# bash builtin: wait for the next background job to terminate
+		RET=$?
+		if [ $RET = 127 ] ; then break ; fi	# 127 is what wait reports when no job is left
+		check_success $RET	# NOTE(review): check_success is defined outside this hunk
+	done
+    ;;
+
+    *)	# any other MAKEFLAGS value: run the schedulers one after the other
+	for sched in $SCHEDULERS
+	do
+		run $sched
+	done
+    ;;
+esac

+ 4 - 3
examples/stencil/Makefile.am

@@ -16,9 +16,10 @@
 include $(top_srcdir)/starpu.mk
 
 AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@ $(FXT_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) $(FXT_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 if STARPU_USE_MPI
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
@@ -73,7 +74,7 @@ if !STARPU_HAVE_WINDOWS
 ## test loader program
 if !STARPU_CROSS_COMPILING
 LOADER			=	loader
-loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	= 	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	./$(LOADER)
 loader_SOURCES		=	../../tests/loader.c
 noinst_PROGRAMS		+=	loader

+ 57 - 0
include/fstarpu_mod.f90

@@ -92,6 +92,14 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_CUDA_ASYNC
         type(c_ptr), bind(C) :: FSTARPU_OPENCL_ASYNC
 
+        !type(c_ptr), bind(C) :: FSTARPU_PER_WORKER
+        !type(c_ptr), bind(C) :: FSTARPU_PER_ARCH
+        !type(c_ptr), bind(C) :: FSTARPU_PER_COMMON
+        type(c_ptr), bind(C) :: FSTARPU_HISTORY_BASED
+        type(c_ptr), bind(C) :: FSTARPU_REGRESSION_BASED
+        type(c_ptr), bind(C) :: FSTARPU_NL_REGRESSION_BASED
+        type(c_ptr), bind(C) :: FSTARPU_MULTIPLE_REGRESSION_BASED
+
         ! (some) portable iso_c_binding types
         type(c_ptr), bind(C) :: FSTARPU_SZ_C_DOUBLE
         type(c_ptr), bind(C) :: FSTARPU_SZ_C_FLOAT
@@ -649,6 +657,18 @@ module fstarpu_mod
                         character(c_char), intent(in) :: cl_name
                 end subroutine fstarpu_codelet_set_name
 
+                subroutine fstarpu_codelet_set_model (cl, cl_perfmodel) bind(C) ! set the codelet perfmodel
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: cl ! codelet pointer, passed by value
+                        type(c_ptr), value, intent(in) :: cl_perfmodel ! perfmodel pointer, passed by value
+                end subroutine fstarpu_codelet_set_model
+
+                subroutine fstarpu_codelet_set_energy_model (cl, cl_perfmodel) bind(C) ! presumably the energy counterpart - TODO confirm
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: cl ! codelet pointer, passed by value
+                        type(c_ptr), value, intent(in) :: cl_perfmodel ! perfmodel pointer, passed by value
+                end subroutine fstarpu_codelet_set_energy_model
+
                 subroutine fstarpu_codelet_add_cpu_func (cl, f_ptr) bind(C)
                         use iso_c_binding, only: c_ptr, c_funptr
                         type(c_ptr), value, intent(in) :: cl
@@ -714,6 +734,28 @@ module fstarpu_mod
                         type(c_ptr), value, intent(in) :: where ! C function expects an intptr_t
                 end subroutine fstarpu_codelet_set_where
 
+                function fstarpu_perfmodel_allocate () bind(C) ! allocate an empty perfmodel structure
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr) :: fstarpu_perfmodel_allocate ! pointer to the perfmodel structure
+                end function fstarpu_perfmodel_allocate
+
+                subroutine fstarpu_perfmodel_free (model) bind(C) ! free a perfmodel structure
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: model ! perfmodel pointer, passed by value
+                end subroutine fstarpu_perfmodel_free
+
+                subroutine fstarpu_perfmodel_set_symbol (model, model_symbol) bind(C) ! set the perfmodel symbol
+                        use iso_c_binding, only: c_ptr, c_char
+                        type(c_ptr), value, intent(in) :: model ! perfmodel pointer, passed by value
+                        character(c_char), intent(in) :: model_symbol ! passed by reference; presumably NUL-terminated - TODO confirm
+                end subroutine fstarpu_perfmodel_set_symbol
+
+                subroutine fstarpu_perfmodel_set_type (model, type) bind(C) ! set the perfmodel type
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: model ! perfmodel pointer, passed by value
+                        type(c_ptr), value, intent(in) :: type ! C function expects an intptr_t
+                end subroutine fstarpu_perfmodel_set_type
+
                 ! == starpu_data_interface.h ==
 
                 ! uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags);
@@ -2434,6 +2476,21 @@ module fstarpu_mod
                         FSTARPU_OPENCL_ASYNC = &
                             fstarpu_get_constant(C_CHAR_"FSTARPU_OPENCL_ASYNC"//C_NULL_CHAR)
 
+                        !FSTARPU_PER_WORKER = &
+                        !        fstarpu_get_constant(C_CHAR_"FSTARPU_PER_WORKER"//C_NULL_CHAR)
+                        !FSTARPU_PER_ARCH = &
+                        !        fstarpu_get_constant(C_CHAR_"FSTARPU_PER_ARCH"//C_NULL_CHAR)
+                        !FSTARPU_PER_COMMON = &
+                        !        fstarpu_get_constant(C_CHAR_"FSTARPU_PER_COMMON"//C_NULL_CHAR)
+                        FSTARPU_HISTORY_BASED = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_HISTORY_BASED"//C_NULL_CHAR)
+                        FSTARPU_REGRESSION_BASED = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_REGRESSION_BASED"//C_NULL_CHAR)
+                        FSTARPU_NL_REGRESSION_BASED = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_NL_REGRESSION_BASED"//C_NULL_CHAR)
+                        FSTARPU_MULTIPLE_REGRESSION_BASED = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_MULTIPLE_REGRESSION_BASED"//C_NULL_CHAR)
+
                         ! Initialize size constants as 'c_ptr'
                         FSTARPU_SZ_C_DOUBLE        = sz_to_p(c_sizeof(FSTARPU_SZ_C_DOUBLE_dummy))
                         FSTARPU_SZ_C_FLOAT        = sz_to_p(c_sizeof(FSTARPU_SZ_C_FLOAT_dummy))

+ 14 - 0
include/starpu.h

@@ -196,6 +196,7 @@ struct starpu_conf
 	   (default = 0)
 	*/
 	unsigned use_explicit_workers_bindid;
+
 	/**
 	   If the starpu_conf::use_explicit_workers_bindid flag is
 	   set, this array indicates where to bind the different
@@ -217,6 +218,7 @@ struct starpu_conf
 	   (default = 0)
 	*/
 	unsigned use_explicit_workers_cuda_gpuid;
+
 	/**
 	   If the starpu_conf::use_explicit_workers_cuda_gpuid flag is
 	   set, this array contains the logical identifiers of the
@@ -234,6 +236,7 @@ struct starpu_conf
 	   (default = 0)
 	*/
 	unsigned use_explicit_workers_opencl_gpuid;
+
 	/**
 	   If the starpu_conf::use_explicit_workers_opencl_gpuid flag
 	   is set, this array contains the logical identifiers of the
@@ -251,6 +254,7 @@ struct starpu_conf
 	   (default = 0)
 	*/
 	unsigned use_explicit_workers_mic_deviceid;
+
 	/**
 	   If the flag starpu_conf::use_explicit_workers_mic_deviceid
 	   is set, the array contains the logical identifiers of the
@@ -267,6 +271,7 @@ struct starpu_conf
 	   (default = 0)
 	*/
 	unsigned use_explicit_workers_mpi_ms_deviceid;
+
 	/**
 	   If the flag
 	   starpu_conf::use_explicit_workers_mpi_ms_deviceid is set,
@@ -283,6 +288,7 @@ struct starpu_conf
 	   (default = 0)
 	*/
 	int bus_calibrate;
+
 	/**
 	   If this flag is set, StarPU will calibrate the performance
 	   models when executing tasks. If this value is equal to -1,
@@ -333,6 +339,7 @@ struct starpu_conf
 	   (default = 0)
 	*/
 	int disable_asynchronous_copy;
+
 	/**
 	   This flag should be set to 1 to disable asynchronous copies
 	   between CPUs and CUDA accelerators.
@@ -345,6 +352,7 @@ struct starpu_conf
 	   (default = 0)
 	*/
 	int disable_asynchronous_cuda_copy;
+
 	/**
 	   This flag should be set to 1 to disable asynchronous copies
 	   between CPUs and OpenCL accelerators.
@@ -361,6 +369,7 @@ struct starpu_conf
 	   (default = 0)
 	*/
 	int disable_asynchronous_opencl_copy;
+
 	/**
 	   This flag should be set to 1 to disable asynchronous copies
 	   between CPUs and MIC accelerators.
@@ -373,6 +382,7 @@ struct starpu_conf
 	   (default = 0).
 	*/
 	int disable_asynchronous_mic_copy;
+
 	/**
 	   This flag should be set to 1 to disable asynchronous copies
 	   between CPUs and MPI Master Slave devices.
@@ -395,6 +405,7 @@ struct starpu_conf
 	   (default = <c>NULL</c>)
 	*/
 	unsigned *cuda_opengl_interoperability;
+
 	/**
 	   Size of the array starpu_conf::cuda_opengl_interoperability
 	*/
@@ -406,6 +417,7 @@ struct starpu_conf
 	   (default = <c>NULL</c>)
 	*/
 	struct starpu_driver *not_launched_drivers;
+
 	/**
 	   The number of StarPU drivers that should not be launched by
 	   StarPU, i.e number of elements of the array
@@ -422,7 +434,9 @@ struct starpu_conf
 	   would disturb the trace).
 	*/
 	uint64_t trace_buffer_size;
+
 	int global_sched_ctx_min_priority;
+
 	int global_sched_ctx_max_priority;
 
 #ifdef STARPU_WORKER_CALLBACKS

+ 1 - 1
include/starpu_clusters.h

@@ -122,7 +122,7 @@ struct starpu_cluster_machine* starpu_cluster_machine(hwloc_obj_type_t cluster_l
 int starpu_uncluster_machine(struct starpu_cluster_machine* clusters);
 int starpu_cluster_print(struct starpu_cluster_machine* clusters);
 
-/* Prologue functions */
+/** Prologue functions */
 void starpu_openmp_prologue(void*);
 #define starpu_intel_openmp_mkl_prologue starpu_openmp_prologue
 #ifdef STARPU_MKL

+ 5 - 5
include/starpu_config.h.in

@@ -211,9 +211,9 @@
 #undef STARPU_MAXNUMANODES
 
 /**
- * Define the maximum number of CUDA devices that are supported by StarPU.
- * @ingroup API_CUDA_Extensions
- */
+   Define the maximum number of CUDA devices that are supported by StarPU.
+   @ingroup API_CUDA_Extensions
+*/
 #undef STARPU_MAXCUDADEVS
 
 /**
@@ -304,10 +304,10 @@ typedef ssize_t starpu_ssize_t;
 #undef STARPU_PTHREAD_COND_INITIALIZER_ZERO
 #undef STARPU_PTHREAD_RWLOCK_INITIALIZER_ZERO
 
-/* This is only for building examples */
+/** This is only for building examples */
 #undef STARPU_HAVE_HELGRIND_H
 
-/* Enable Fortran to C MPI interface */
+/** Enable Fortran to C MPI interface */
 #undef  HAVE_MPI_COMM_F2C
 
 #undef STARPU_HAVE_DARWIN

+ 3 - 1
include/starpu_data.h

@@ -115,7 +115,9 @@ enum starpu_data_access_mode
 
 struct starpu_data_interface_ops;
 
-/** Set the name of the data, to be shown in various profiling tools. */
+/**
+   Set the name of the data, to be shown in various profiling tools.
+*/
 void starpu_data_set_name(starpu_data_handle_t handle, const char *name);
 
 /**

+ 4 - 0
include/starpu_fxt.h

@@ -69,6 +69,7 @@ struct starpu_fxt_options
 	char *number_events_path;
 	char *anim_path;
 	char *states_path;
+	char *dir;
 	char worker_names[STARPU_NMAXWORKERS][256];
 	int nworkers;
 	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
@@ -104,6 +105,7 @@ struct starpu_fxt_options
 };
 
 void starpu_fxt_options_init(struct starpu_fxt_options *options);
+void starpu_fxt_options_shutdown(struct starpu_fxt_options *options);
 void starpu_fxt_generate_trace(struct starpu_fxt_options *options);
 
 /**
@@ -129,7 +131,9 @@ void starpu_fxt_start_profiling(void);
    start recording it again, etc.
 */
 void starpu_fxt_stop_profiling(void);
+
 void starpu_fxt_write_data_trace(char *filename_in);
+void starpu_fxt_write_data_trace_in_dir(char *filename_in, char *dir);
 
 /**
     Wrapper to get value of env variable STARPU_FXT_TRACE

+ 15 - 1
include/starpu_task.h

@@ -546,6 +546,20 @@ struct starpu_codelet
 	unsigned color;
 
 	/**
+	   Optional field, the default value is <c>NULL</c>. This is a
+	   function pointer of prototype <c>void (*f)(void *)</c>
+	   which specifies a possible callback. If this pointer is
+	   non-<c>NULL</c>, the callback function is executed on the
+	   host after the execution of the task. If the task defines a
+	   callback, the codelet callback is not called, unless called
+	   within the task callback function.
+	   The callback is passed the value contained in the
+	   starpu_task::callback_arg field. No callback is executed if
+	   this field is set to <c>NULL</c>.
+	*/
+	void (*callback_func)(void *);
+
+	/**
 	   Various flags for the codelet.
 	 */
 	int flags;
@@ -763,7 +777,7 @@ struct starpu_task
 	   <c>NULL</c>.
 
 	   With starpu_task_insert() and alike this can be specified thanks to
-	   ::STARPU_CALLBACK_ARG followed by the function pointer, or thanks to
+	   ::STARPU_CALLBACK_ARG followed by the argument pointer, or thanks to
 	   ::STARPU_CALLBACK_WITH_ARG or
 	   ::STARPU_CALLBACK_WITH_ARG_NFREE followed by the function
 	   pointer and the argument.

+ 3 - 0
include/starpu_worker.h

@@ -302,6 +302,9 @@ struct starpu_tree* starpu_workers_get_tree(void);
 
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
 
+/**
+   Return whether worker \p workerid is currently blocked in a parallel task.
+ */
 unsigned starpu_worker_is_blocked_in_parallel(int workerid);
 
 unsigned starpu_worker_is_slave_somewhere(int workerid);

+ 3 - 3
julia/examples/Makefile.am

@@ -20,7 +20,7 @@ noinst_PROGRAMS		=
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
-loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	= 	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/julia/examples/$(LOADER)
@@ -96,9 +96,9 @@ endif
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpujulia-@STARPU_EFFECTIVE_VERSION@.la -lm @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpujulia-@STARPU_EFFECTIVE_VERSION@.la -lm $(FXT_LIBS) $(MAGMA_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 check_PROGRAMS = $(LOADER) $(starpu_julia_EXAMPLES)
 SHELL_TESTS	=

+ 3 - 3
julia/examples/old_examples/mult/mult.c

@@ -204,9 +204,9 @@ double median_time(unsigned nb_test, unsigned xdim, unsigned ydim, unsigned zdim
 {
 	unsigned i;
 
-	float * A = (float *) valloc(zdim*ydim*sizeof(float));
-	float * B = (float *) valloc(xdim*zdim*sizeof(float));
-	float * C = (float *) valloc(xdim*ydim*sizeof(float));
+	float * A = (float *) malloc(zdim*ydim*sizeof(float));
+	float * B = (float *) malloc(xdim*zdim*sizeof(float));
+	float * C = (float *) malloc(xdim*ydim*sizeof(float));
 
 	double exec_times[nb_test];
 

+ 2 - 2
julia/src/Makefile.am

@@ -19,9 +19,9 @@ include $(top_srcdir)/starpu-notests.mk
 CLEANFILES = *.gcno *.gcda
 
 AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(FXT_CFLAGS) -fPIC
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ `@JULIA@ $(top_srcdir)/julia/src/openblas_ldflags.jl`
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/julia/src
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) -no-undefined
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ `@JULIA@ $(top_srcdir)/julia/src/openblas_ldflags.jl`
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
 
 SUBDIRS = dynamic_compiler
 

+ 2 - 2
julia/src/dynamic_compiler/Makefile.am

@@ -20,9 +20,9 @@ AM_CPPFLAGS = -I$(abs_top_srcdir)/include/ -I$(abs_top_builddir)/src -I$(abs_top
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 AM_CFLAGS += -fPIC -O3 -g -DSTRIDE=${STRIDE} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@
+LIBS += -L @STARPU_BUILD_DIR@/julia/src/.libs/ -lstarpujulia-1.3
 CUDA_CFLAGS = $(STARPU_CUDA_CPPFLAGS) -Wno-deprecated-gpu-targets
-LDFLAGS = -L @STARPU_BUILD_DIR@/julia/src/.libs/ -lstarpujulia-1.3
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
 

+ 45 - 0
mpi/GNUmakefile.in

@@ -0,0 +1,45 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+SUBDIRS=
+
+@STARPU_BUILD_EXAMPLES_TRUE@SUBDIRS += examples
+@STARPU_BUILD_TESTS_TRUE@SUBDIRS += tests
+
+check: check-recursive
+
+# divide by 4 the number of jobs to run in parallel, since mpirun will start 4
+# processes in the tests and examples
+@STARPU_SIMGRID_FALSE@check-recursive:
+@STARPU_SIMGRID_FALSE@	RET=0 ; \
+@STARPU_SIMGRID_FALSE@	NJOBS=`printf %s "$(MAKEFLAGS)" | sed -ne 's/.*-j \?\([0-9]\+\).*/\1/p'` ; \
+@STARPU_SIMGRID_FALSE@	JOBS="" ; \
+@STARPU_SIMGRID_FALSE@	if [ -n "$$NJOBS" ] ; then \
+@STARPU_SIMGRID_FALSE@		if [ "$$NJOBS" -ge 4 ] ; then \
+@STARPU_SIMGRID_FALSE@			JOBS="-j$$(($$NJOBS / 4))" ; \
+@STARPU_SIMGRID_FALSE@		else \
+@STARPU_SIMGRID_FALSE@			JOBS="-j1" ; \
+@STARPU_SIMGRID_FALSE@		fi ; \
+@STARPU_SIMGRID_FALSE@	fi ; \
+@STARPU_SIMGRID_FALSE@	for i in $(SUBDIRS) ; do \
+@STARPU_SIMGRID_FALSE@		$(MAKE) check -C $$i MAKEFLAGS="$(MAKEFLAGS) $$JOBS" || RET=1; \
+@STARPU_SIMGRID_FALSE@	done ; \
+@STARPU_SIMGRID_FALSE@	exit $$RET
+
+%: force
+	@$(MAKE) -f Makefile $@
+
+force: ;

+ 0 - 18
mpi/Makefile.am

@@ -33,21 +33,3 @@ versinclude_HEADERS = 					\
 	include/starpu_mpi.h				\
 	include/starpu_mpi_lb.h				\
 	include/fstarpu_mpi_mod.f90
-
-if !STARPU_SIMGRID
-check-recursive:
-	RET=0 ; \
-	NJOBS=`printf %s "$(MAKEFLAGS)" | sed -ne 's/.*-j \?\([0-9]\+\).*/\1/p'` ; \
-	JOBS="" ; \
-	if [ -n "$$NJOBS" ] ; then \
-		if [ "$$NJOBS" -ge 4 ] ; then \
-			JOBS="-j$$(($$NJOBS / 4))" ; \
-		else \
-			JOBS="-j1" ; \
-		fi ; \
-	fi ; \
-	for i in $(SUBDIRS) ; do \
-		$(MAKE) check -C $$i MAKEFLAGS="$(MAKEFLAGS) $$JOBS" || RET=1; \
-	done ; \
-	exit $$RET
-endif

+ 3 - 4
mpi/examples/Makefile.am

@@ -26,7 +26,7 @@ noinst_PROGRAMS		=
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
-loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	= 	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/mpi/examples/$(LOADER)
@@ -108,9 +108,9 @@ endif
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm $(FXT_LIBS) $(MAGMA_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 ###################
 # Stencil example #
@@ -466,7 +466,6 @@ benchs_sendrecv_bench_SOURCES += benchs/abstract_sendrecv_bench.c
 
 benchs_sendrecv_parallel_tasks_bench_SOURCES = benchs/sendrecv_parallel_tasks_bench.c
 benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/bench_helper.c
-benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/abstract_sendrecv_bench.c
 
 benchs_burst_SOURCES = benchs/burst.c
 benchs_burst_SOURCES += benchs/burst_helper.c

+ 27 - 6
mpi/examples/benchs/abstract_sendrecv_bench.c

@@ -19,7 +19,7 @@
 
 
 
-void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier, int bidir)
 {
 	uint64_t iterations = LOOPS_DEFAULT;
 	uint64_t s = 0;
@@ -62,6 +62,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 	float* vector_recv = NULL;
 	double t1, t2, global_tstart, global_tend;
 	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
+	starpu_mpi_req send_req, recv_req;
 
 	if (thread_barrier != NULL)
 	{
@@ -88,18 +89,38 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 			if (mpi_rank == 0)
 			{
 				t1 = starpu_timing_now();
-				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
-				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+				if (bidir)
+				{
+					starpu_mpi_isend(handle_send, &send_req, 1, 0, MPI_COMM_WORLD);
+					starpu_mpi_irecv(handle_recv, &recv_req, 1, 1, MPI_COMM_WORLD);
+					starpu_mpi_wait(&send_req, MPI_STATUS_IGNORE);
+					starpu_mpi_wait(&recv_req, MPI_STATUS_IGNORE);
+				}
+				else
+				{
+					starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+					starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+				}
 				t2 = starpu_timing_now();
 
-				const double t = (t2 -t1) / 2;
+				const double t = (t2 - t1) / 2;
 
 				lats[j] = t;
 			}
 			else
 			{
-				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
-				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+				if (bidir)
+				{
+					starpu_mpi_irecv(handle_recv, &recv_req, 0, 0, MPI_COMM_WORLD);
+					starpu_mpi_isend(handle_send, &send_req, 0, 1, MPI_COMM_WORLD);
+					starpu_mpi_wait(&recv_req, MPI_STATUS_IGNORE);
+					starpu_mpi_wait(&send_req, MPI_STATUS_IGNORE);
+				}
+				else
+				{
+					starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+					starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+				}
 			}
 
 			starpu_mpi_barrier(MPI_COMM_WORLD);

+ 1 - 1
mpi/examples/benchs/abstract_sendrecv_bench.h

@@ -17,4 +17,4 @@
 #include <starpu.h>
 
 
-void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier);
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier, int bidir);

+ 21 - 6
mpi/examples/benchs/sendrecv_bench.c

@@ -16,6 +16,8 @@
 /*
  * Basic send receive benchmark.
  * Inspired a lot from NewMadeleine examples/benchmarks/nm_bench_sendrecv.c
+ *
+ * The option --bidir is available to do full-duplex communications.
  */
 
 #include <starpu_mpi.h>
@@ -23,11 +25,22 @@
 #include "abstract_sendrecv_bench.h"
 
 
+static inline void man()
+{
+	fprintf(stderr, "Options:\n");
+	fprintf(stderr, "\t-h --help   display this help\n");
+	fprintf(stderr, "\t-p          pause workers during benchmark\n");
+	fprintf(stderr, "\t--bidir     full-duplex communications\n");
+	exit(EXIT_SUCCESS);
+}
+
+
 int main(int argc, char **argv)
 {
 	int ret, rank, worldsize;
 	int pause_workers = 0;
 	int i = 0;
+	int bidir = 0;
 
 
 	for (i = 1; i < argc; i++)
@@ -39,15 +52,17 @@ int main(int argc, char **argv)
 		}
 		else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
-			fprintf(stderr, "Options:\n");
-			fprintf(stderr, "\t-h --help   display this help\n");
-			fprintf(stderr, "\t-p          pause workers during benchmark\n");
-			exit(EXIT_SUCCESS);
+			man();
+		}
+		else if (strcmp(argv[i], "--bidir") == 0)
+		{
+			bidir = 1;
+			printf("Communications will be full-duplex.\n");
 		}
 		else
 		{
 			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
-			exit(EXIT_FAILURE);
+			man();
 		}
 	}
 
@@ -75,7 +90,7 @@ int main(int argc, char **argv)
 		starpu_pause();
 	}
 
-	sendrecv_bench(rank, NULL);
+	sendrecv_bench(rank, NULL, bidir);
 
 	if (pause_workers)
 	{

+ 1 - 1
mpi/examples/benchs/sendrecv_gemm_bench.c

@@ -56,7 +56,7 @@ static void* comm_thread_func(void* arg)
 		fprintf(stderr, "[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
 	}
 
-	sendrecv_bench(mpi_rank, &thread_barrier);
+	sendrecv_bench(mpi_rank, &thread_barrier, /* half-duplex communications */ 0);
 
 	return NULL;
 }

+ 0 - 1
mpi/examples/benchs/sendrecv_parallel_tasks_bench.c

@@ -34,7 +34,6 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "bench_helper.h"
-#include "abstract_sendrecv_bench.h"
 
 #define NB_WARMUP_PINGPONGS 10
 

+ 1 - 1
mpi/examples/matrix_mult/mm.c

@@ -128,7 +128,7 @@ static void register_matrices()
 	int mr = (comm_rank == 0) ? STARPU_MAIN_RAM : -1;
 
 	/* mpi tag used for the block */
-	int tag = 0;
+	starpu_mpi_tag_t tag = 0;
 
 	int b_row,b_col;
 

+ 4 - 1
mpi/examples/mpi_lu/plu_example.c

@@ -133,7 +133,10 @@ static void parse_args(int rank, int argc, char **argv)
 
 #ifdef STARPU_HAVE_VALGRIND_H
 	if (RUNNING_ON_VALGRIND)
-		size = 16;
+	{
+		size = 4;
+		nblocks = 4;
+	}
 #endif
 }
 

+ 4 - 1
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -112,7 +112,10 @@ static void parse_args(int argc, char **argv)
 
 #ifdef STARPU_HAVE_VALGRIND_H
 	if (RUNNING_ON_VALGRIND)
-		size = 16;
+	{
+		size = 4;
+		nblocks = 4;
+	}
 #endif
 }
 

+ 4 - 1
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -123,7 +123,10 @@ static void parse_args(int argc, char **argv)
 
 #ifdef STARPU_HAVE_VALGRIND_H
 	if (RUNNING_ON_VALGRIND)
-		size = 16;
+	{
+		size = 4;
+		nblocks = 4;
+	}
 #endif
 }
 

+ 9 - 9
mpi/examples/mpi_lu/pxlu.c

@@ -90,7 +90,7 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
 /* Send handle to every node appearing in the mask, and unlock tag once the
  * transfers are done. */
-static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int mpi_tag, starpu_tag_t tag)
+static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, starpu_mpi_tag_t mpi_tag, starpu_tag_t tag)
 {
 	unsigned cnt = 0;
 
@@ -134,7 +134,7 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 struct recv_when_done_callback_arg
 {
 	int source;
-	int mpi_tag;
+	starpu_mpi_tag_t mpi_tag;
 	starpu_data_handle_t handle;
 	starpu_tag_t unlocked_tag;
 };
@@ -150,7 +150,7 @@ static void callback_receive_when_done(void *_arg)
 }
 
 static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
-				int source, int mpi_tag,
+				int source, starpu_mpi_tag_t mpi_tag,
 				starpu_data_handle_t handle,
 				starpu_tag_t partial_tag,
 				starpu_tag_t unlocked_tag)
@@ -218,7 +218,7 @@ static void create_task_11_recv(unsigned k)
 #else
 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)(k);
 #endif
-	int mpi_tag = MPI_TAG11(k);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG11(k);
 	starpu_tag_t partial_tag = TAG11_SAVE_PARTIAL(k);
 	starpu_tag_t unlocked_tag = TAG11_SAVE(k);
 
@@ -260,7 +260,7 @@ static void callback_task_11_real(void *_arg)
 	/* Send the block to those nodes */
 	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, k);
 	starpu_tag_t tag = TAG11_SAVE(k);
-	int mpi_tag = MPI_TAG11(k);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG11(k);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
 
 	free(arg);
@@ -380,7 +380,7 @@ static void create_task_12_recv(unsigned k, unsigned j)
 #else
 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j,k);
 #endif
-	int mpi_tag = MPI_TAG12(k, j);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG12(k, j);
 	starpu_tag_t partial_tag = TAG12_SAVE_PARTIAL(k, j);
 	starpu_tag_t unlocked_tag = TAG12_SAVE(k, j);
 
@@ -415,7 +415,7 @@ static void callback_task_12_real(void *_arg)
 	/* Send the block to those nodes */
 	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, j);
 	starpu_tag_t tag = TAG12_SAVE(k, j);
-	int mpi_tag = MPI_TAG12(k, j);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG12(k, j);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
 
 	free(arg);
@@ -564,7 +564,7 @@ static void create_task_21_recv(unsigned k, unsigned i)
 #else
 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i, k);
 #endif
-	int mpi_tag = MPI_TAG21(k, i);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG21(k, i);
 	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(k, i);
 	starpu_tag_t unlocked_tag = TAG21_SAVE(k, i);
 
@@ -600,7 +600,7 @@ static void callback_task_21_real(void *_arg)
 	/* Send the block to those nodes */
 	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(i, k);
 	starpu_tag_t tag = TAG21_SAVE(k, i);
-	int mpi_tag = MPI_TAG21(k, i);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG21(k, i);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
 
 	free(arg);

+ 4 - 5
mpi/src/Makefile.am

@@ -24,9 +24,10 @@ BUILT_SOURCES =
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) $(NMAD_CFLAGS)
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS) $(NMAD_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS) $(NMAD_LDFLAGS)
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(FXT_LIBS) $(MAGMA_LIBS) $(NMAD_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS) $(NMAD_LDFLAGS)
+LIBS += $(MPICC_LDFLAGS) $(FXT_LDFLAGS)
 
 ldflags =
 
@@ -54,10 +55,8 @@ endif STARPU_HAVE_WINDOWS
 
 lib_LTLIBRARIES = libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
-libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
-  -version-info $(LIBSTARPUMPI_INTERFACE_CURRENT):$(LIBSTARPUMPI_INTERFACE_REVISION):$(LIBSTARPUMPI_INTERFACE_AGE) \
-  $(MPICC_LDFLAGS) $(FXT_LDFLAGS)
+  -version-info $(LIBSTARPUMPI_INTERFACE_CURRENT):$(LIBSTARPUMPI_INTERFACE_REVISION):$(LIBSTARPUMPI_INTERFACE_AGE)
 noinst_HEADERS =					\
 	starpu_mpi_private.h				\
 	starpu_mpi_fxt.h				\

+ 22 - 20
mpi/src/load_balancer/policy/data_movements_interface.c

@@ -23,7 +23,7 @@
 
 #if defined(STARPU_USE_MPI_MPI)
 
-int **data_movements_get_ref_tags_table(starpu_data_handle_t handle)
+starpu_mpi_tag_t **data_movements_get_ref_tags_table(starpu_data_handle_t handle)
 {
 	struct data_movements_interface *dm_interface =
 		(struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
@@ -45,7 +45,7 @@ int **data_movements_get_ref_ranks_table(starpu_data_handle_t handle)
 		return NULL;
 }
 
-int *data_movements_get_tags_table(starpu_data_handle_t handle)
+starpu_mpi_tag_t *data_movements_get_tags_table(starpu_data_handle_t handle)
 {
 	struct data_movements_interface *dm_interface =
 		(struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
@@ -94,8 +94,8 @@ int data_movements_reallocate_tables(starpu_data_handle_t handle, int size)
 
 	if (dm_interface->size)
 	{
-		_STARPU_MPI_MALLOC(dm_interface->tags, size*sizeof(int));
-		_STARPU_MPI_MALLOC(dm_interface->ranks, size*sizeof(int));
+		_STARPU_MPI_MALLOC(dm_interface->tags, size*sizeof(*dm_interface->tags));
+		_STARPU_MPI_MALLOC(dm_interface->ranks, size*sizeof(*dm_interface->ranks));
 	}
 
 	return 0 ;
@@ -129,14 +129,15 @@ static starpu_ssize_t data_movements_allocate_data_on_node(void *data_interface,
 {
 	struct data_movements_interface *dm_interface = (struct data_movements_interface *) data_interface;
 
-	int *addr_tags;
+	starpu_mpi_tag_t *addr_tags;
 	int *addr_ranks;
-	starpu_ssize_t requested_memory = dm_interface->size * sizeof(int);
+	starpu_ssize_t requested_memory_tags = dm_interface->size * sizeof(starpu_mpi_tag_t);
+	starpu_ssize_t requested_memory_ranks = dm_interface->size * sizeof(int);
 
-	addr_tags = (int*) starpu_malloc_on_node(node, requested_memory);
+	addr_tags = (starpu_mpi_tag_t*) starpu_malloc_on_node(node, requested_memory_tags);
 	if (!addr_tags)
 		goto fail_tags;
-	addr_ranks = (int*) starpu_malloc_on_node(node, requested_memory);
+	addr_ranks = (int*) starpu_malloc_on_node(node, requested_memory_ranks);
 	if (!addr_ranks)
 		goto fail_ranks;
 
@@ -144,10 +145,10 @@ static starpu_ssize_t data_movements_allocate_data_on_node(void *data_interface,
 	dm_interface->tags = addr_tags;
 	dm_interface->ranks = addr_ranks;
 
-	return 2*requested_memory;
+	return requested_memory_tags+requested_memory_ranks;
 
 fail_ranks:
-	starpu_free_on_node(node, (uintptr_t) addr_tags, requested_memory);
+	starpu_free_on_node(node, (uintptr_t) addr_tags, requested_memory_tags);
 fail_tags:
 	return -ENOMEM;
 }
@@ -155,10 +156,11 @@ fail_tags:
 static void data_movements_free_data_on_node(void *data_interface, unsigned node)
 {
 	struct data_movements_interface *dm_interface = (struct data_movements_interface *) data_interface;
-	starpu_ssize_t requested_memory = dm_interface->size * sizeof(int);
+	starpu_ssize_t requested_memory_tags = dm_interface->size * sizeof(starpu_mpi_tag_t);
+	starpu_ssize_t requested_memory_ranks = dm_interface->size * sizeof(int);
 
-	starpu_free_on_node(node, (uintptr_t) dm_interface->tags, requested_memory);
-	starpu_free_on_node(node, (uintptr_t) dm_interface->ranks, requested_memory);
+	starpu_free_on_node(node, (uintptr_t) dm_interface->tags, requested_memory_tags);
+	starpu_free_on_node(node, (uintptr_t) dm_interface->ranks, requested_memory_ranks);
 }
 
 static size_t data_movements_get_size(starpu_data_handle_t handle)
@@ -166,7 +168,7 @@ static size_t data_movements_get_size(starpu_data_handle_t handle)
 	size_t size;
 	struct data_movements_interface *dm_interface = (struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
-	size = (dm_interface->size * 2 * sizeof(int)) + sizeof(int);
+	size = (dm_interface->size * sizeof(starpu_mpi_tag_t)) + (dm_interface->size * sizeof(int)) + sizeof(int);
 	return size;
 }
 
@@ -192,8 +194,8 @@ static int data_movements_pack_data(starpu_data_handle_t handle, unsigned node,
 		memcpy(data, &dm_interface->size, sizeof(int));
 		if (dm_interface->size)
 		{
-			memcpy(data+sizeof(int), dm_interface->tags, (dm_interface->size*sizeof(int)));
-			memcpy(data+sizeof(int)+(dm_interface->size*sizeof(int)), dm_interface->ranks, dm_interface->size*sizeof(int));
+			memcpy(data+sizeof(int), dm_interface->tags, (dm_interface->size*sizeof(starpu_mpi_tag_t)));
+			memcpy(data+sizeof(int)+(dm_interface->size*sizeof(starpu_mpi_tag_t)), dm_interface->ranks, dm_interface->size*sizeof(int));
 		}
 	}
 
@@ -216,8 +218,8 @@ static int data_movements_unpack_data(starpu_data_handle_t handle, unsigned node
 
 	if (dm_interface->size)
 	{
-		memcpy(dm_interface->tags, data+sizeof(int), dm_interface->size*sizeof(int));
-		memcpy(dm_interface->ranks, data+sizeof(int)+(dm_interface->size*sizeof(int)), dm_interface->size*sizeof(int));
+		memcpy(dm_interface->tags, data+sizeof(int), dm_interface->size*sizeof(starpu_mpi_tag_t));
+		memcpy(dm_interface->ranks, data+sizeof(int)+(dm_interface->size*sizeof(starpu_mpi_tag_t)), dm_interface->size*sizeof(int));
 	}
 
     return 0;
@@ -233,7 +235,7 @@ static int copy_any_to_any(void *src_interface, unsigned src_node,
 
 	if (starpu_interface_copy((uintptr_t) src_data_movements->tags, 0, src_node,
 				    (uintptr_t) dst_data_movements->tags, 0, dst_node,
-				     src_data_movements->size*sizeof(int),
+				     src_data_movements->size*sizeof(starpu_mpi_tag_t),
 				     async_data))
 		ret = -EAGAIN;
 	if (starpu_interface_copy((uintptr_t) src_data_movements->ranks, 0, src_node,
@@ -265,7 +267,7 @@ static struct starpu_data_interface_ops interface_data_movements_ops =
 	.describe = NULL
 };
 
-void data_movements_data_register(starpu_data_handle_t *handleptr, unsigned home_node, int *ranks, int *tags, int size)
+void data_movements_data_register(starpu_data_handle_t *handleptr, unsigned home_node, int *ranks, starpu_mpi_tag_t *tags, int size)
 {
 	struct data_movements_interface data_movements =
 	{

+ 10 - 8
mpi/src/load_balancer/policy/data_movements_interface.h

@@ -16,27 +16,29 @@
 
 #include <starpu.h>
 
+/** @file */
+
 #ifndef __DATA_MOVEMENTS_INTERFACE_H
 #define __DATA_MOVEMENTS_INTERFACE_H
 
-/* interface for data_movements */
+/** interface for data_movements */
 struct data_movements_interface
 {
-	/* Data tags table */
-	int *tags;
-	/* Ranks table (where to move the corresponding data) */
+	/** Data tags table */
+	starpu_mpi_tag_t *tags;
+	/** Ranks table (where to move the corresponding data) */
 	int *ranks;
-	/* Size of the tables */
+	/** Size of the tables */
 	int size;
 };
 
-void data_movements_data_register(starpu_data_handle_t *handle, unsigned home_node, int *ranks, int *tags, int size);
+void data_movements_data_register(starpu_data_handle_t *handle, unsigned home_node, int *ranks, starpu_mpi_tag_t *tags, int size);
 
-int **data_movements_get_ref_tags_table(starpu_data_handle_t handle);
+starpu_mpi_tag_t **data_movements_get_ref_tags_table(starpu_data_handle_t handle);
 int **data_movements_get_ref_ranks_table(starpu_data_handle_t handle);
 int data_movements_reallocate_tables(starpu_data_handle_t handle, int size);
 
-int *data_movements_get_tags_table(starpu_data_handle_t handle);
+starpu_mpi_tag_t *data_movements_get_tags_table(starpu_data_handle_t handle);
 int *data_movements_get_ranks_table(starpu_data_handle_t handle);
 int data_movements_get_size_tables(starpu_data_handle_t handle);
 

+ 4 - 2
mpi/src/load_balancer/policy/load_balancer_policy.h

@@ -19,12 +19,14 @@
 
 #include <starpu_mpi_lb.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {
 #endif
 
-/* A load balancer consists in a collection of operations on a data
+/** A load balancer consists of a collection of operations on a data
  * representing the load of the application (in terms of computation, memory,
  * whatever). StarPU allows several entry points for the user. The load
  * balancer allows the user to give its load balancing methods to be used on
@@ -36,7 +38,7 @@ struct load_balancer_policy
 	void (*submitted_task_entry_point)();
 	void (*finished_task_entry_point)();
 
-	/* Name of the load balancing policy. The selection of the load balancer is
+	/** Name of the load balancing policy. The selection of the load balancer is
 	 * performed through the use of the STARPU_MPI_LB=name environment
 	 * variable.
 	 */

+ 11 - 9
mpi/src/load_balancer/policy/load_data_interface.h

@@ -16,29 +16,31 @@
 
 #include <starpu.h>
 
+/** @file */
+
 #ifndef __LOAD_DATA_INTERFACE_H
 #define __LOAD_DATA_INTERFACE_H
 
-/* interface for load_data */
+/** interface for load_data */
 struct load_data_interface
 {
-	/* Starting time of the execution */
+	/** Starting time of the execution */
 	double start;
-	/* Elapsed time until the start time and the time when event "launch a load
+	/** Elapsed time between the start time and the time when event "launch a load
 	 * balancing phase" is triggered */
 	double elapsed_time;
-	/* Current submission phase, i.e how many balanced steps have already
+	/** Current submission phase, i.e how many balanced steps have already
 	 * happened so far. */
 	int phase;
-	/* Number of currently submitted tasks */
+	/** Number of currently submitted tasks */
 	int nsubmitted_tasks;
-	/* Number of currently finished tasks */
+	/** Number of currently finished tasks */
 	int nfinished_tasks;
-	/* Task threshold to sleep the submission thread */
+	/** Task threshold to sleep the submission thread */
 	int sleep_task_threshold;
-	/* Task threshold to wake-up the submission thread */
+	/** Task threshold to wake-up the submission thread */
 	int wakeup_task_threshold;
-	/* Ratio of submitted tasks to wait for completion before waking up the
+	/** Ratio of submitted tasks to wait for completion before waking up the
 	 * submission thread */
 	double wakeup_ratio;
 };

+ 6 - 6
mpi/src/load_balancer/policy/load_heat_propagation.c

@@ -27,14 +27,14 @@
 
 #if defined(STARPU_USE_MPI_MPI)
 
-static int TAG_LOAD(int n)
+static starpu_mpi_tag_t TAG_LOAD(int n)
 {
-	return (n+1) << 24;
+	return ((starpu_mpi_tag_t) n+1) << 24;
 }
 
-static int TAG_MOV(int n)
+static starpu_mpi_tag_t TAG_MOV(int n)
 {
-	return (n+1) << 20;
+	return ((starpu_mpi_tag_t) n+1) << 20;
 }
 
 /* Hash table of local pieces of data that has been moved out of the local MPI
@@ -132,7 +132,7 @@ static void balance(starpu_data_handle_t load_data_cpy)
 
 			if (nhandles)
 			{
-				int *tags = data_movements_get_tags_table(data_movements_handles[my_rank]);
+				starpu_mpi_tag_t *tags = data_movements_get_tags_table(data_movements_handles[my_rank]);
 				int *ranks = data_movements_get_ranks_table(data_movements_handles[my_rank]);
 
 				for (n = 0; n < nhandles; n++)
@@ -564,7 +564,7 @@ static int deinit_heat()
 		_STARPU_DEBUG("Move back %u data on node %d ..\n", ndata_to_move_back, my_rank);
 		data_movements_reallocate_tables(data_movements_handles[my_rank], ndata_to_move_back);
 
-		int *tags = data_movements_get_tags_table(data_movements_handles[my_rank]);
+		starpu_mpi_tag_t *tags = data_movements_get_tags_table(data_movements_handles[my_rank]);
 		int *ranks = data_movements_get_ranks_table(data_movements_handles[my_rank]);
 
 		int n = 0;

+ 2 - 0
mpi/src/mpi/starpu_mpi_comm.h

@@ -25,6 +25,8 @@
 
 #include <mpi/starpu_mpi_mpi_backend.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/mpi/starpu_mpi_driver.h

@@ -19,6 +19,8 @@
 
 #include <starpu.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 2 - 0
mpi/src/mpi/starpu_mpi_early_data.h

@@ -25,6 +25,8 @@
 #include <common/uthash.h>
 #include <starpu_mpi_private.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 2 - 0
mpi/src/mpi/starpu_mpi_early_request.h

@@ -23,6 +23,8 @@
 #include <common/config.h>
 #include <common/list.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 7 - 0
mpi/src/mpi/starpu_mpi_mpi.c

@@ -760,6 +760,12 @@ static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 {
 	_STARPU_MPI_LOG_IN();
 
+	/* FIXME: rather use MPI_Ibarrier and make it a detached request.
+	 * We'd then be able to introduce starpu_mpi_ibarrier, and make
+	 * starpu_mpi_barrier just call starpu_mpi_ibarrier(); starpu_mpi_wait();
+	 * That would solve the locking issues that arise when intermixing
+	 * starpu_mpi_barrier with other communications.
+	 */
 	barrier_req->ret = MPI_Barrier(barrier_req->node_tag.node.comm);
 	STARPU_MPI_ASSERT_MSG(barrier_req->ret == MPI_SUCCESS, "MPI_Barrier returning %s", _starpu_mpi_get_mpi_error_code(barrier_req->ret));
 
@@ -1507,6 +1513,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		starpu_pthread_wait_wait(&_starpu_mpi_thread_wait);
 		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 #endif
+		STARPU_VALGRIND_YIELD();
 	}
 
 	_STARPU_MPI_TRACE_POLLING_END();

+ 2 - 0
mpi/src/mpi/starpu_mpi_mpi.h

@@ -23,6 +23,8 @@
 #include <common/config.h>
 #include <common/list.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 3 - 1
mpi/src/mpi/starpu_mpi_mpi_backend.h

@@ -20,6 +20,8 @@
 #include <common/config.h>
 #include <common/uthash.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -54,7 +56,7 @@ struct _starpu_mpi_req_backend
 	starpu_pthread_cond_t req_cond;
 	starpu_pthread_mutex_t posted_mutex;
 	starpu_pthread_cond_t posted_cond;
-	/* In the case of a Wait/Test request, we are going to post a request
+	/** In the case of a Wait/Test request, we are going to post a request
 	 * to test the completion of another request */
 	struct _starpu_mpi_req *other_request;
 

+ 2 - 0
mpi/src/mpi/starpu_mpi_sync_data.h

@@ -23,6 +23,8 @@
 #include <common/config.h>
 #include <common/list.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 2 - 0
mpi/src/mpi/starpu_mpi_tag.h

@@ -21,6 +21,8 @@
 #include <stdlib.h>
 #include <mpi.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 9 - 0
mpi/src/nmad/starpu_mpi_nmad.c

@@ -634,9 +634,18 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
 	/* Tell pioman to use a bound thread for communication progression:
 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
+#ifdef HAVE_PIOM_LTASK_SET_BOUND_THREAD_OS_INDEXES
+	/* We prefer to give the OS index of the core, because StarPU can have
+	 * a different view of the topology, especially if STARPU_WORKERS_GETBIND
+	 * is enabled */
+	int indexes[1] = { starpu_get_pu_os_index((unsigned) _starpu_mpi_thread_cpuid) };
+	if (!_starpu_mpi_nobind)
+		piom_ltask_set_bound_thread_os_indexes(HWLOC_OBJ_PU, indexes, 1);
+#else
 	int indexes[1] = { _starpu_mpi_thread_cpuid };
 	if (!_starpu_mpi_nobind)
 		piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
+#endif
 
 	/* Register some hooks for communication progress if needed */
 	int polling_point_prog, polling_point_idle;

+ 2 - 0
mpi/src/nmad/starpu_mpi_nmad.h

@@ -23,6 +23,8 @@
 #include <common/config.h>
 #include <common/list.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_NMAD
 
 #ifdef __cplusplus

+ 3 - 1
mpi/src/nmad/starpu_mpi_nmad_backend.h

@@ -19,6 +19,8 @@
 
 #include <common/config.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -38,7 +40,7 @@ struct _starpu_mpi_req_backend
 	piom_cond_t req_cond;
 	nm_sr_request_t size_req;
 
-	// When datatype is unknown:
+	/** When datatype is unknown */
 	struct nm_data_s unknown_datatype_body;
 	struct nm_data_s unknown_datatype_data;
 	struct nm_data_s unknown_datatype_size;

+ 2 - 0
mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.h

@@ -19,6 +19,8 @@
 
 #include <common/config.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 14 - 12
mpi/src/starpu_mpi.c

@@ -325,22 +325,23 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
 
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
 {
-	int me, rank, tag;
+	int me, rank;
+	starpu_mpi_tag_t data_tag;
 
 	rank = starpu_mpi_data_get_rank(data_handle);
 	if (rank == -1)
 	{
-		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register() or starpu_mpi_data_register()\n");
+		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
 	}
 
 	starpu_mpi_comm_rank(comm, &me);
 	if (node == rank)
 		return;
 
-	tag = starpu_mpi_data_get_tag(data_handle);
-	if (tag == -1)
+	data_tag = starpu_mpi_data_get_tag(data_handle);
+	if (data_tag == -1)
 	{
-		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register() or starpu_mpi_data_register()\n");
+		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
 	}
 
 	if (me == node)
@@ -350,7 +351,7 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 		if (already_received == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
-			starpu_mpi_irecv_detached(data_handle, rank, tag, comm, callback, arg);
+			starpu_mpi_irecv_detached(data_handle, rank, data_tag, comm, callback, arg);
 		}
 	}
 	else if (me == rank)
@@ -360,14 +361,15 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 		if (already_sent == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
-			starpu_mpi_isend_detached(data_handle, node, tag, comm, NULL, NULL);
+			starpu_mpi_isend_detached(data_handle, node, data_tag, comm, NULL, NULL);
 		}
 	}
 }
 
 void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
 {
-	int me, rank, tag;
+	int me, rank;
+	starpu_mpi_tag_t data_tag;
 
 	rank = starpu_mpi_data_get_rank(data_handle);
 	if (rank == -1)
@@ -379,8 +381,8 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	if (node == rank)
 		return;
 
-	tag = starpu_mpi_data_get_tag(data_handle);
-	if (tag == -1)
+	data_tag = starpu_mpi_data_get_tag(data_handle);
+	if (data_tag == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 	}
@@ -393,7 +395,7 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 		if (already_received == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
-			starpu_mpi_recv(data_handle, rank, tag, comm, &status);
+			starpu_mpi_recv(data_handle, rank, data_tag, comm, &status);
 		}
 	}
 	else if (me == rank)
@@ -403,7 +405,7 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 		if (already_sent == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
-			starpu_mpi_send(data_handle, node, tag, comm);
+			starpu_mpi_send(data_handle, node, data_tag, comm);
 		}
 	}
 }

+ 2 - 0
mpi/src/starpu_mpi_cache.h

@@ -21,6 +21,8 @@
 #include <stdlib.h>
 #include <mpi.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_cache_stats.h

@@ -21,6 +21,8 @@
 #include <stdlib.h>
 #include <mpi.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 1 - 1
mpi/src/starpu_mpi_datatype.c

@@ -106,7 +106,7 @@ static int handle_to_datatype_tensor(starpu_data_handle_t data_handle, MPI_Datat
 	unsigned ldy = starpu_tensor_get_local_ldy(data_handle);
 	unsigned ldz = starpu_tensor_get_local_ldz(data_handle);
 	unsigned ldt = starpu_tensor_get_local_ldt(data_handle);
-	size_t elemsize = starpu_block_get_elemsize(data_handle);
+	size_t elemsize = starpu_tensor_get_elemsize(data_handle);
 
 	MPI_Datatype datatype_3dlayer;
 	ret = MPI_Type_vector(ny, nx*elemsize, ldy*elemsize, MPI_BYTE, &datatype_3dlayer);

+ 2 - 0
mpi/src/starpu_mpi_datatype.h

@@ -20,6 +20,8 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi_private.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_fxt.h

@@ -22,6 +22,8 @@
 #include <common/config.h>
 #include <common/fxt.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_init.h

@@ -20,6 +20,8 @@
 #include <starpu.h>
 #include <starpu_mpi.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_private.c

@@ -68,4 +68,6 @@ void _starpu_mpi_env_init(void)
 	_starpu_mpi_use_prio = starpu_get_env_number_default("STARPU_MPI_PRIORITIES", 1);
 	_starpu_mpi_use_coop_sends = starpu_get_env_number_default("STARPU_MPI_COOP_SENDS", 1);
 	_starpu_mpi_mem_throttle = starpu_get_env_number_default("STARPU_MPI_MEM_THROTTLE", 0);
+	_starpu_debug_level_min = starpu_get_env_number_default("STARPU_MPI_DEBUG_LEVEL_MIN", 0);
+	_starpu_debug_level_max = starpu_get_env_number_default("STARPU_MPI_DEBUG_LEVEL_MAX", 0);
 }

+ 8 - 3
mpi/src/starpu_mpi_private.h

@@ -27,6 +27,8 @@
 #include <common/starpu_spinlock.h>
 #include <core/simgrid.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -246,7 +248,8 @@ LIST_TYPE(_starpu_mpi_req,
 
 	int ret;
 
-	enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
+	/** 0 send, 1 recv */
+	enum _starpu_mpi_request_type request_type;
 
 	unsigned submitted;
 	unsigned completed;
@@ -315,8 +318,10 @@ struct _starpu_mpi_argc_argv
 	int *argc;
 	char ***argv;
 	MPI_Comm comm;
-	int fargc;	// Fortran argc
-	char **fargv;	// Fortran argv
+	/** Fortran argc */
+	int fargc;
+	/** Fortran argv */
+	char **fargv;
 	int rank;
 	int world_size;
 };

+ 2 - 0
mpi/src/starpu_mpi_select_node.h

@@ -19,6 +19,8 @@
 
 #include <mpi.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_stats.h

@@ -21,6 +21,8 @@
 #include <stdlib.h>
 #include <mpi.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 5 - 5
mpi/src/starpu_mpi_task_insert.c

@@ -797,15 +797,15 @@ void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
 void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
 {
 	int me, rank, nb_nodes;
-	starpu_mpi_tag_t tag;
+	starpu_mpi_tag_t data_tag;
 
 	rank = starpu_mpi_data_get_rank(data_handle);
-	tag = starpu_mpi_data_get_tag(data_handle);
+	data_tag = starpu_mpi_data_get_tag(data_handle);
 	if (rank == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 	}
-	if (tag == -1)
+	if (data_tag == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 	}
@@ -851,7 +851,7 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 				struct _starpu_mpi_redux_data_args *args;
 				_STARPU_MPI_MALLOC(args, sizeof(struct _starpu_mpi_redux_data_args));
 				args->data_handle = data_handle;
-				args->data_tag = tag;
+				args->data_tag = data_tag;
 				args->node = i;
 				args->comm = comm;
 
@@ -878,7 +878,7 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 	else
 	{
 		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
-		starpu_mpi_isend_detached_prio(data_handle, rank, tag, prio, comm, NULL, NULL);
+		starpu_mpi_isend_detached_prio(data_handle, rank, data_tag, prio, comm, NULL, NULL);
 		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
 	}
 	/* FIXME: In order to prevent simultaneous receive submissions

+ 2 - 0
mpi/src/starpu_mpi_task_insert.h

@@ -17,6 +17,8 @@
 #ifndef __STARPU_MPI_TASK_INSERT_H__
 #define __STARPU_MPI_TASK_INSERT_H__
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 5 - 3
mpi/tests/Makefile.am

@@ -24,7 +24,7 @@ noinst_PROGRAMS		=
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
-loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	= 	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
@@ -84,9 +84,9 @@ endif
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la $(FXT_LIBS) $(MAGMA_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 ########################
 # Unit testcases       #
@@ -109,6 +109,7 @@ starpu_mpi_TESTS +=				\
 	insert_task_owner			\
 	insert_task_owner2			\
 	insert_task_owner_data			\
+	insert_task_tags			\
 	matrix					\
 	matrix2					\
 	mpi_barrier				\
@@ -205,6 +206,7 @@ noinst_PROGRAMS +=				\
 	insert_task_count			\
 	insert_task_dyn_handles			\
 	insert_task_seq				\
+	insert_task_tags			\
 	multiple_send				\
 	mpi_scatter_gather			\
 	mpi_reduction				\

+ 84 - 0
mpi/tests/insert_task_tags.c

@@ -0,0 +1,84 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], void *_args)
+{
+	(void) _args;
+	(void) descr;
+
+	FPRINTF_MPI(stderr, "Hello\n");
+}
+
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &starpu_perfmodel_nop,
+	.name = "insert_task_tags"
+};
+
+int main(int argc, char **argv)
+{
+	int ret, rank, err;
+	int x=32;
+	starpu_data_handle_t handle0;
+	starpu_data_handle_t handle1;
+	int64_t *value;
+
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (rank != 0 && rank != 1)
+		goto end;
+
+	starpu_variable_data_register(&handle0, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
+	starpu_variable_data_register(&handle1, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
+
+	starpu_mpi_comm_get_attr(MPI_COMM_WORLD, STARPU_MPI_TAG_UB, &value, &err);
+	assert(err == 1);
+
+	starpu_mpi_data_register(handle0, (*value)-1, 1);
+	starpu_mpi_data_register(handle1, (*value)-2, 1);
+
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_EXECUTE_ON_NODE, 0,
+				     STARPU_RW, handle0,
+				     0);
+	assert(err == 0);
+
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_EXECUTE_ON_NODE, 1,
+				     STARPU_RW, handle1,
+				     0);
+	assert(err == 0);
+
+	FPRINTF_MPI(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+	starpu_data_unregister(handle0);
+	starpu_data_unregister(handle1);
+
+end:
+	starpu_mpi_shutdown();
+
+	return 0;
+}
+

+ 1 - 1
mpi/tests/ring.c

@@ -99,7 +99,7 @@ int main(int argc, char **argv)
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		int tag = loop*size + rank;
+		starpu_mpi_tag_t tag = loop*size + rank;
 
 		if (loop == 0 && rank == 0)
 		{

+ 1 - 1
mpi/tests/ring_async.c

@@ -99,7 +99,7 @@ int main(int argc, char **argv)
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		int tag = loop*size + rank;
+		starpu_mpi_tag_t tag = loop*size + rank;
 
 		if (loop == 0 && rank == 0)
 		{

+ 1 - 1
mpi/tests/ring_async_implicit.c

@@ -92,7 +92,7 @@ int main(int argc, char **argv)
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		int tag = loop*size + rank;
+		starpu_mpi_tag_t tag = loop*size + rank;
 
 		if (loop == 0 && rank == 0)
 		{

+ 1 - 1
mpi/tests/ring_sync.c

@@ -99,7 +99,7 @@ int main(int argc, char **argv)
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		int tag = loop*size + rank;
+		starpu_mpi_tag_t tag = loop*size + rank;
 
 		if (loop == 0 && rank == 0)
 		{

+ 1 - 1
mpi/tests/ring_sync_detached.c

@@ -112,7 +112,7 @@ int main(int argc, char **argv)
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		int tag = loop*size + rank;
+		starpu_mpi_tag_t tag = loop*size + rank;
 
 		if (loop == 0 && rank == 0)
 		{

+ 3 - 3
mpi/tests/user_defined_datatype.c

@@ -26,9 +26,9 @@
 #  define ELEMENTS 1000
 #endif
 
-typedef void (*test_func)(starpu_data_handle_t *, int, int, int);
+typedef void (*test_func)(starpu_data_handle_t *, int, int, starpu_mpi_tag_t);
 
-void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
+void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank, starpu_mpi_tag_t tag)
 {
 	int i;
 	(void)rank;
@@ -42,7 +42,7 @@ void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_hand
 		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handles[i], 0, NULL, NULL);
 }
 
-void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
+void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank, starpu_mpi_tag_t tag)
 {
 	int i;
 

+ 3 - 2
mpi/tools/Makefile.am

@@ -21,9 +21,10 @@ include $(top_srcdir)/starpu.mk
 SUBDIRS =
 
 AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la @LIBS@ $(FXT_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/tools/ -I$(top_srcdir)/mpi/ -I$(top_srcdir)/mpi/include -I$(top_builddir)/src -I$(top_srcdir)/src -DSTARPU_REPLAY_MPI
-AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la $(FXT_LIBS)
+LIBS += $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 CC=$(CC_OR_MPICC)
 CCLD=$(CC_OR_MPICC)

+ 0 - 0
sc_hypervisor/examples/Makefile.am


Some files were not shown because too many files changed in this diff