Forráskód Böngészése

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu

Denis Barthou 5 éve
szülő
commit
ce489e4de4

+ 2 - 0
.gitignore

@@ -7,6 +7,8 @@
 /build
 /build2
 /build-aux
+/build_starpu
+/install
 /GPATH
 /GRTAGS
 /GTAGS

+ 1 - 0
ChangeLog

@@ -29,6 +29,7 @@ New features:
   * New number_events.data trace file which monitors number of events in trace
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
+  * New STARPU_PER_WORKER perfmodel.
 
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.

+ 167 - 249
configure.ac

@@ -92,6 +92,7 @@ if test x$enable_perf_debug = xyes; then
     enable_shared=no
 fi
 default_enable_mpi_check=maybe
+default_enable_mpi=maybe
 
 ###############################################################################
 #                                                                             #
@@ -206,6 +207,9 @@ if test x$enable_simgrid = xyes ; then
         # want that by default
 	default_enable_mpi_check=no
 
+	# disable MPI support by default
+	default_enable_mpi=no
+
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	AC_LANG_PUSH([C++])
 	if test x$enable_shared = xno ; then
@@ -270,145 +274,146 @@ fi
 
 ###############################################################################
 #                                                                             #
-#                                    MPI                                      #
+#                                LIBTOOLS                                     #
 #                                                                             #
 ###############################################################################
 
-AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
-                              [Disable StarPU MPI library generation])],
-            [enable_mpi=$enableval],
-            [enable_mpi=yes])
+#c++11 detection
+AX_CXX_COMPILE_STDCXX(11,noext,optional)
 
-AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
-                              [Enable StarPU to run with the master-slave mode])],
-            use_mpi_master_slave=$enableval,
-            use_mpi_master_slave=no)
+AC_SUBST([STARPU_HAVE_CXX11], $HAVE_CXX11)
+AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
+if test $HAVE_CXX11 -eq 1; then
+  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
+fi
 
-#Check MPICC
-AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
-           [Path of the mpicc compiler])],
-   [
-       if test x$withval = xyes; then
-           AC_MSG_ERROR(--with-mpicc must be given a pathname)
-       elif test x$withval = xno ; then
-           mpi_requested=no
-	   mpicc_path=""
-	   use_mpi=no
-       else
-	   mpi_requested=yes
-           mpicc_path=$withval
-       fi
-   ],
-   [
-       mpi_requested=maybe
-       if test x$enable_simgrid = xyes ; then
-           DEFAULT_MPICC=smpicc
-       else
-           DEFAULT_MPICC=mpicc
-       fi
-       # nothing was specified: default value is used
-       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
-   ])
+LT_PREREQ([2.2])
+LT_INIT([win32-dll])
 
-# in case MPI was explicitely required, but is not available, this is an error
-if test x$mpi_requested = xyes -a ! -x "$mpicc_path"; then
-   AC_MSG_ERROR([Compiler MPI not valid])
-fi
+AC_HEADER_STDC
 
-if test x$mpi_requested != xno ; then
-   # We test if the MPICC compiler exists
-     if test ! -x $mpicc_path; then
-         #MPICC does not exists or is not executable
-	 AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
-	 use_mpi=no
-     else
-	 use_mpi=yes
-	 if test x$enable_simgrid = xyes ; then
-             AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
-						  [Path of the smpirun helper])],
-			 [
-			     if test x$withval = xyes; then
-				 AC_MSG_ERROR(--with-smpirun must be given a pathname)
-			     else
-				 smpirun_path=$withval
-			     fi
-			 ],
-			 [
-			     # nothing was specified: default value is used
-			     AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
-			 ])
-	 fi
-     fi
+AC_C_RESTRICT
+
+# Check if bash is available
+AC_PATH_PROG([REALBASH], [bash], , [/bin:$PATH])
+
+# Record git version
+AC_PATH_PROG(gitcommand, git)
+if test "$gitcommand" = "" ; then
+   if test -f $srcdir/STARPU-REVISION ; then
+      cp $srcdir/STARPU-REVISION .
+   else
+      echo "unknown" > ./STARPU-REVISION
+   fi
+else
+   git log -n 1 --pretty="%H" $srcdir > ./STARPU-REVISION
 fi
 
-AC_MSG_CHECKING(mpicc path)
+AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
+
+###############################################################################
+#                                                                             #
+#                           MPI compilers                                     #
+#                                                                             #
+###############################################################################
+
+#Check MPICC
+if test x$enable_simgrid = xyes ; then
+    DEFAULT_MPICC=smpicc
+else
+    DEFAULT_MPICC=mpicc
+fi
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc=<path to mpicc>], [Path of the mpicc compiler])], [DEFAULT_MPICC=$withval])
+AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+AC_MSG_CHECKING(whether mpicc is available)
 AC_MSG_RESULT($mpicc_path)
 AC_SUBST(MPICC, $mpicc_path)
 
-
 #Check MPICXX/MPIC++
-AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
-           [Path of the mpicxx/mpic++ compiler])],
-   [
-       if test x$withval = xyes; then
-           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
-       else
-           mpicxx_path=$withval
-       fi
-   ],
-   [
-       if test x$enable_simgrid = xyes ; then
-           DEFAULT_MPICXX=smpicxx
-       else
-           DEFAULT_MPICXX=mpicxx
-       fi
-       # nothing was specified: default value is used
-       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+if test x$enable_simgrid = xyes ; then
+    DEFAULT_MPICXX=smpicxx
+else
+    DEFAULT_MPICXX=mpicxx
+fi
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx=<path to mpicxx>], [Path of the mpicxx/mpic++ compiler])], [DEFAULT_MPICXX=$withval])
+AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
 
-       # try with mpic++ if mpicxx was not found
-       if test x$mpicxx_path = xno ; then
-            DEFAULT_MPICXX=mpic++
-            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
-       fi
-   ])
+# try with mpic++ if mpicxx was not found
+if test x$mpicxx_path = xno ; then
+    DEFAULT_MPICXX=mpic++
+    AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+fi
 
 # We test if the MPICXX/MPIC++ compiler exists
 if test ! -x $mpicxx_path; then
-    #MPICXX/MPIC++ does not exists or is not executable
     AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
-    use_mpicxx=no
-else
-    use_mpicxx=yes
+    mpicxx_path=no
 fi
 
-AC_MSG_CHECKING(mpicxx/mpic++ path)
+AC_MSG_CHECKING(whether mpicxx is available)
 AC_MSG_RESULT($mpicxx_path)
 AC_SUBST(MPICXX, $mpicxx_path)
 
+###############################################################################
+#                                                                             #
+#                                    MPI                                      #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
+                              [Disable StarPU MPI library generation])],
+            [enable_mpi=$enableval],
+            [enable_mpi=$default_enable_mpi])
 
-if test x$use_mpi = xyes -a \( x$enable_mpi = xyes -o x$use_mpi_master_slave = xyes \) ; then
-    cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
-else
-    cc_or_mpicc=$CC
+if test x$enable_mpi = xmaybe ; then
+    if test -x "$mpicc_path"; then
+	enable_mpi=yes
+    else
+	enable_mpi=no
+    fi
 fi
 
-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+# in case MPI was explicitly required, but mpicc is not available, this is an error
+if test x$enable_mpi = xyes -a ! -x "$mpicc_path"; then
+   AC_MSG_ERROR([Compiler MPI '$mpicc_path' not valid])
+fi
 
-AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
-				   [Prevent StarPU MPI from reading buffers while being sent over MPI])],
-				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
-if test x$enable_mpi_pedantic_isend = xyes; then
-	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
+build_mpi_lib=$enable_mpi
+
+###############################################################################
+#                                                                             #
+#                                NEW MADELEINE                                #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(nmad, [AS_HELP_STRING([--enable-nmad],
+		                    [Enable StarPU MPI library generation using the new madeleine backend])],
+            [enable_nmad=$enableval],
+            [enable_nmad=no])
+
+build_nmad_lib=no
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+#We can only build StarPU MPI Library if User wants it and MPI is available
+if test x$enable_mpi = xyes -a x$enable_nmad = xyes ; then
+    build_nmad_lib=yes
+    build_mpi_lib=no
+    PKG_CHECK_MODULES([NMAD],[nmad])
+else
+    build_nmad_lib=no
 fi
 
-#We can only build MPI Master Slave if User wants it and MPI is available
-if test x$use_mpi_master_slave = xyes -a x$use_mpi = xyes -a x$use_mpicxx = xyes; then
+###############################################################################
+#                                                                             #
+#                             MPI Master Slave                                #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
+                              [Enable StarPU to run with the master-slave mode])],
+              use_mpi_master_slave=$enableval,
+              use_mpi_master_slave=no)
+#We can only build MPI Master Slave if User wants it and MPI compilers are available
+if test x$use_mpi_master_slave = xyes -a x$mpicc_path != xno -a x${mpicxx_path} != xno ; then
     build_mpi_master_slave=yes
 else
     build_mpi_master_slave=no
@@ -417,7 +422,9 @@ fi
 #users cannot use both at the same time
 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
     AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
-	enable_mpi=no
+    build_mpi_lib=no
+    build_nmad_lib=no
+    enable_mpi=no
 fi
 
 if test x$build_mpi_master_slave = xyes; then
@@ -449,95 +456,19 @@ AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
 AC_MSG_RESULT($nmaxmpidev)
 AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
 
-
-###############################################################################
-#                                                                             #
-#                                NEW MADELEINE                                #
-#                                                                             #
-###############################################################################
-
-AC_ARG_ENABLE(nmad, [AS_HELP_STRING([--enable-nmad],
-                              [Enable StarPU MPI library generation using new madeleine instead of mpi])],
-            [enable_nmad=$enableval],
-            [enable_nmad=no])
-
-if test x$use_mpi = xyes -a \( x$enable_nmad \) ; then
-    cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
-else
-    cc_or_mpicc=$CC
-fi
-
-build_nmad_lib=no
-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
-#We can only build StarPU MPI Library if User wants it and MPI is available
-if test x$use_mpi = xyes -a x$enable_nmad = xyes ; then
-    build_nmad_lib=yes
-    enable_mpi=no
-    PKG_CHECK_MODULES([NMAD],[nmad])
-else
-    build_nmad_lib=no
-fi
-
-# in case NMAD was explicitely required, but the compiler MPI, this is an error
-if test x$enable_nmad = xyes -a ! -x "$mpicc_path"; then
-   AC_MSG_ERROR([Compiler MPI not valid])
-fi
-
-
-AC_MSG_CHECKING(whether the StarPU MPI nmad library should be generated)
-AC_MSG_RESULT($build_nmad_lib)
-
 ###############################################################################
 #                                                                             #
-#                                LIBTOOLS                                     #
+#                       Miscellaneous things for MPI                          #
 #                                                                             #
 ###############################################################################
 
-#c++11 detection
-AX_CXX_COMPILE_STDCXX(11,noext,optional)
-
-AC_SUBST([STARPU_HAVE_CXX11], $HAVE_CXX11)
-AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
-if test $HAVE_CXX11 -eq 1; then
-  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
-fi
-
-LT_PREREQ([2.2])
-LT_INIT([win32-dll])
-
-AC_HEADER_STDC
-
-AC_C_RESTRICT
-
-# Check if bash is available
-AC_PATH_PROG([REALBASH], [bash], , [/bin:$PATH])
-
-# Record git version
-AC_PATH_PROG(gitcommand, git)
-if test "$gitcommand" = "" ; then
-   if test -f $srcdir/STARPU-REVISION ; then
-      cp $srcdir/STARPU-REVISION .
-   else
-      echo "unknown" > ./STARPU-REVISION
-   fi
-else
-   git log -n 1 --pretty="%H" $srcdir > ./STARPU-REVISION
+AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
+				   [Prevent StarPU MPI from reading buffers while being sent over MPI])],
+				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
+if test x$enable_mpi_pedantic_isend = xyes; then
+	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
 fi
 
-AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
-
-###############################################################################
-#                                                                             #
-#                       Miscellaneous things for MPI                          #
-#                                                                             #
-###############################################################################
-
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]),
 	      [enable_mpi_check=$enableval], [enable_mpi_check=$default_enable_mpi_check])
@@ -551,68 +482,45 @@ fi
 if test x$enable_mpi_check = xno ; then
     running_mpi_check=no
 fi
+if test x$enable_mpi = xno ; then
+    running_mpi_check=no
+fi
 
-
-if test x$enable_simgrid = xno ; then
+if test x$enable_mpi = xyes -a x$running_mpi_check = xyes ; then
     # Check if mpiexec is available
-    AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
-                [Path of mpiexec])],
-        [
-            if test x$withval = xyes; then
-                AC_MSG_ERROR(--with-mpiexec must be given a pathname)
-            else
-                mpiexec_path=$withval
-            fi
-        ],
-        [
-            # nothing was specified: look in the path
-	    if test x$mpicc_path = x ; then
-		AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$PATH])
-	    else
-		AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
-	    fi
-        ])
-
+    if test x$enable_simgrid = xyes ; then
+	DEFAULT_MPIEXEC=smpirun
+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]], [Path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
+	AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$simgrid_dir/bin:$PATH])
+    else
+	DEFAULT_MPIEXEC=mpiexec
+	AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<path to mpiexec>], [Path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
+	if test x$mpicc_path = x ; then
+	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$PATH])
+	else
+	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$(dirname $mpicc_path):$PATH])
+	fi
+    fi
     AC_MSG_CHECKING(whether mpiexec is available)
     AC_MSG_RESULT($mpiexec_path)
 
     # We test if MPIEXEC exists
     if test ! -x $mpiexec_path; then
-        # if it's not valid, it could be the parameter given to configure.ac was not a full path, let's look for it
-	if test x$mpicc_path = x ; then
-            AC_PATH_PROG(mpiexec_path_bis, $mpiexec_path, [no], [$PATH])
-	else
-            AC_PATH_PROG(mpiexec_path_bis, $mpiexec_path, [no], [$(dirname $mpicc_path):$PATH])
-	fi
-        AC_MSG_CHECKING(whether mpiexec is available (2nd try))
-        AC_MSG_RESULT($mpiexec_path_bis)
-	if test -x $mpiexec_path_bis; then
-	   mpiexec_path=$mpiexec_path_bis
-	else
-           #MPIEXEC does not exists or is not executable
-           AC_MSG_RESULT(The mpiexec script is not valid)
-           running_mpi_check=no
-           mpiexec_path=""
-	fi
+        AC_MSG_RESULT(The mpiexec script '$mpiexec_path' is not valid)
+        running_mpi_check=no
+        mpiexec_path=""
     fi
     AC_SUBST(MPIEXEC,$mpiexec_path)
 fi
 
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
-if test x$use_mpi = xyes ; then
-    AC_MSG_CHECKING(whether MPI tests should be run)
-    AC_MSG_RESULT($running_mpi_check)
-fi
-
-#We can only build StarPU MPI Library if User wants it and MPI is available
-if test x$use_mpi = xyes -a x$enable_mpi = xyes ; then
-    build_mpi_lib=yes
-else
-    build_mpi_lib=no
-fi
+AC_MSG_CHECKING(whether MPI tests should be run)
+AC_MSG_RESULT($running_mpi_check)
 
 AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
 AC_MSG_RESULT($build_mpi_lib)
+AC_MSG_CHECKING(whether the StarPU MPI nmad library should be generated)
+AC_MSG_RESULT($build_nmad_lib)
 
 AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes)
 if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
@@ -622,11 +530,9 @@ if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
 	else
 		AC_DEFINE(STARPU_USE_MPI_NMAD,[1],[whether the StarPU MPI library (with a NewMadeleine implementation) is available])
 	fi
-else
-	running_mpi_check=no
 fi
 
-if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
+if test x$enable_mpi = xyes ; then
     if test x$enable_simgrid = xyes ; then
         if test x$enable_shared = xyes ; then
	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, if you need the MPI support, then use --disable-shared to fix this, else disable MPI with --disable-mpi])
@@ -644,17 +550,16 @@ AM_CONDITIONAL(STARPU_USE_MPI_NMAD, test x$build_nmad_lib = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI, test x$build_nmad_lib = xyes -o x$build_mpi_lib = xyes)
 
 AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
-			[Arguments for mpiexec])],
-	[
+					  [Arguments for mpiexec])],
+	    [
 		mpiexec_args=$withval
-	])
+	    ])
 AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
 
-
 AC_MSG_CHECKING(whether MPI debug messages should be displayed)
 AC_ARG_ENABLE(mpi-verbose, [AS_HELP_STRING([--enable-mpi-verbose],
-			[display MPI verbose debug messages (--enable-mpi-verbose=extra increase the verbosity)])],
-			enable_mpi_verbose=$enableval, enable_mpi_verbose=no)
+					   [display MPI verbose debug messages (--enable-mpi-verbose=extra increase the verbosity)])],
+	      enable_mpi_verbose=$enableval, enable_mpi_verbose=no)
 AC_MSG_RESULT($enable_mpi_verbose)
 if test x$enable_mpi_verbose = xyes; then
 	AC_DEFINE(STARPU_MPI_VERBOSE, [1], [display MPI verbose debug messages])
@@ -664,6 +569,19 @@ if test x$enable_mpi_verbose = xextra; then
 	AC_DEFINE(STARPU_MPI_EXTRA_VERBOSE, [1], [display MPI verbose debug messages])
 fi
 
+if test x$enable_mpi = xyes -o x$build_mpi_master_slave = xyes ; then
+    cc_or_mpicc=$mpicc_path
+    # For some reason, libtool uses gcc instead of mpicc when linking
+    # libstarpumpi.
+    # On Darwin (and maybe other systems ?) the linker will fail (undefined
+    # references to MPI_*). We manually add the required flags to fix this
+    # issue.
+    AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
+else
+    cc_or_mpicc=$CC
+fi
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+
 ###############################################################################
 #                                                                             #
 #                           MIC device compilation                            #

+ 1 - 1
doc/doxygen/chapters/310_data_management.doxy

@@ -602,7 +602,7 @@ whole machine, it would not be efficient to accumulate them in only one place,
 incurring data transmission each time and access concurrency.
 
 StarPU provides a mode ::STARPU_REDUX, which permits to optimize
-this case: it will allocate a buffer on each memory node, and accumulate
+this case: it will allocate a buffer on each worker (lazily), and accumulate
 intermediate results there. When the data is eventually accessed in the normal
 mode ::STARPU_R, StarPU will collect the intermediate results in just one
 buffer.

+ 4 - 3
doc/doxygen/chapters/320_scheduling.doxy

@@ -190,9 +190,10 @@ single task gives the consumption of the task in Joules, which can be given to
 starpu_perfmodel_update_history().
 
 Another way to provide the energy performance is to define a
-perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH, and set the
-starpu_perfmodel::arch_cost_function field to a function which shall return the
-estimated consumption of the task in Joules. Such a function can for instance
+perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH or
+::STARPU_PER_WORKER, and set the starpu_perfmodel::arch_cost_function or
+starpu_perfmodel::worker_cost_function field to a function which shall return
+the estimated consumption of the task in Joules. Such a function can for instance
 use starpu_task_expected_length() on the task (in µs), multiplied by the
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 get Joules.

+ 2 - 1
doc/doxygen/chapters/350_scheduling_policy_definition.doxy

@@ -45,7 +45,8 @@ provides a complete list of the functions available for writing advanced schedul
 This includes getting an estimation for a task computation completion with
 starpu_task_expected_length(), for the required data transfers with
 starpu_task_expected_data_transfer_time_for(), for the required energy with
-starpu_task_expected_energy(), etc. Other
+starpu_task_expected_energy(), etc. Per-worker variants are also available with
+starpu_task_worker_expected_length(), etc. Other
 useful functions include starpu_transfer_bandwidth(), starpu_transfer_latency(),
 starpu_transfer_predict(), ...
 One can also directly test the presence of a data handle with starpu_data_is_on_node().

+ 5 - 0
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -401,6 +401,11 @@ filled with pointers to functions which return the expected duration
 of the task in micro-seconds, one per architecture, see for instance
 <c>tests/datawizard/locality.c</c>
 </li>
+
+<li>
+Provided explicitly by the application (model type ::STARPU_PER_WORKER)
+similarly with the starpu_perfmodel::worker_cost_function field.
+</li>
 </ul>
 
 For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and

+ 8 - 1
include/starpu_perfmodel.h

@@ -165,6 +165,7 @@ struct starpu_perfmodel_per_arch
 enum starpu_perfmodel_type
 {
         STARPU_PERFMODEL_INVALID=0,
+	STARPU_PER_WORKER,                /**< Application-provided per-worker cost model function */
 	STARPU_PER_ARCH,                  /**< Application-provided per-arch cost model function */
 	STARPU_COMMON,                    /**< Application-provided common cost model function, with per-arch factor */
 	STARPU_HISTORY_BASED,             /**< Automatic history-based cost model */
@@ -226,11 +227,17 @@ struct starpu_perfmodel
 	*/
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 	/**
-	   Used by ::STARPU_COMMON. Take a task, an arch and implementation
+	   Used by ::STARPU_PER_ARCH. Take a task, an arch and implementation
 	   number, and must return a task duration estimation in
 	   micro-seconds on that arch.
 	*/
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
+	/**
+	   Used by ::STARPU_PER_WORKER. Take a task, a worker id and implementation
+	   number, and must return a task duration estimation in
+	   micro-seconds on that worker.
+	*/
+	double (*worker_cost_function)(struct starpu_task *, unsigned workerid, unsigned nimpl);
 
 	/**
 	   Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and

+ 14 - 0
include/starpu_scheduler.h

@@ -110,6 +110,10 @@ struct starpu_sched_policy
 	   to be executed by the worker. This method therefore permits
 	   to keep the state of the scheduler coherent even when
 	   StarPU bypasses the scheduling strategy.
+
+	   Note: to get an estimation of the task duration, \p perf_workerid
+	   needs to be used rather than \p workerid, for the case of parallel
+	   tasks.
 	*/
 	void (*push_task_notify)(struct starpu_task *, int workerid, int perf_workerid, unsigned sched_ctx_id);
 
@@ -366,6 +370,11 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task);
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
 /**
+   Same as starpu_task_expected_length() but for a precise worker.
+*/
+double starpu_task_worker_expected_length(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl);
+
+/**
    Return an estimated speedup factor relative to CPU speed
 */
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch);
@@ -395,6 +404,11 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
 /**
+   Same as starpu_task_expected_energy but for a precise worker
+*/
+double starpu_task_worker_expected_energy(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl);
+
+/**
    Return expected conversion time in ms (multiformat interface only)
 */
 double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);

+ 3 - 3
mpi/include/starpu_mpi.h

@@ -50,9 +50,9 @@ extern "C"
 int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm, struct starpu_conf *conf);
 
 /**
-   Same as starpu_mpi_init_conf(), except that this does not
-   initialize the StarPU library. The caller thus has to call
-   starpu_init() before this.
+   Same as starpu_mpi_init_conf(), except that this does not initialize the
+   StarPU library. The caller thus has to call starpu_init() before this, and it
+   can not reserve a core for the MPI communications.
 */
 int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm);
 

+ 28 - 2
mpi/tests/Makefile.am

@@ -137,7 +137,13 @@ starpu_mpi_TESTS +=				\
 	temporary				\
 	user_defined_datatype			\
 	early_stuff				\
-	sendrecv_bench
+	sendrecv_bench				\
+	sendrecv_parallel_tasks_bench
+
+if !NO_BLAS_LIB
+starpu_mpi_TESTS +=				\
+	sendrecv_gemm_bench
+endif
 
 if !STARPU_SIMGRID
 # missing support in simgrid
@@ -226,7 +232,9 @@ noinst_PROGRAMS =				\
 	starpu_redefine				\
 	load_balancer				\
 	driver					\
-	sendrecv_bench
+	sendrecv_bench				\
+	sendrecv_gemm_bench			\
+	sendrecv_parallel_tasks_bench
 
 XFAIL_TESTS=					\
 	policy_register_toomany			\
@@ -256,4 +264,22 @@ mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
+
+sendrecv_bench_SOURCES = sendrecv_bench.c
+sendrecv_bench_SOURCES += bench_helper.c
+sendrecv_bench_SOURCES += abstract_sendrecv_bench.c
+
+sendrecv_parallel_tasks_bench_SOURCES = sendrecv_parallel_tasks_bench.c
+sendrecv_parallel_tasks_bench_SOURCES += bench_helper.c
+sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
+
+if !NO_BLAS_LIB
+sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
+sendrecv_gemm_bench_SOURCES += bench_helper.c
+sendrecv_gemm_bench_SOURCES += abstract_sendrecv_bench.c
+sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
+
+sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
+endif
+
 endif

+ 136 - 0
mpi/tests/abstract_sendrecv_bench.c

@@ -0,0 +1,136 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "bench_helper.h"
+#include "abstract_sendrecv_bench.h"
+
+
+
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
+{
+	uint64_t iterations = LOOPS_DEFAULT;
+
+	if (mpi_rank >= 2)
+	{
+		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+		{
+			iterations = bench_nb_iterations(iterations, s);
+
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+
+			for (uint64_t j = 0; j < iterations; j++)
+			{
+				starpu_mpi_barrier(MPI_COMM_WORLD);
+			}
+		}
+
+		return;
+	}
+
+	if (mpi_rank == 0)
+	{
+		printf("Times in us\n");
+		printf("# size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
+	}
+
+	int array_size = 0;
+	starpu_data_handle_t handle_send, handle_recv;
+	float* vector_send = NULL;
+	float* vector_recv = NULL;
+	double t1, t2, global_tstart, global_tend;
+	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
+
+	if (thread_barrier != NULL)
+	{
+		STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
+	}
+
+	global_tstart = starpu_timing_now();
+	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	{
+		vector_send = malloc(s);
+		vector_recv = malloc(s);
+		memset(vector_send, 0, s);
+		memset(vector_recv, 0, s);
+
+		starpu_vector_data_register(&handle_send, STARPU_MAIN_RAM, (uintptr_t) vector_send, s, 1);
+		starpu_vector_data_register(&handle_recv, STARPU_MAIN_RAM, (uintptr_t) vector_recv, s, 1);
+
+		iterations = bench_nb_iterations(iterations, s);
+
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+
+		for (uint64_t j = 0; j < iterations; j++)
+		{
+			if (mpi_rank == 0)
+			{
+				t1 = starpu_timing_now();
+				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+				t2 = starpu_timing_now();
+
+				const double t = (t2 -t1) / 2;
+
+				lats[j] = t;
+			}
+			else
+			{
+				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+			}
+
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+		}
+
+		if (mpi_rank == 0)
+		{
+			qsort(lats, iterations, sizeof(double), &comp_double);
+
+			const double min_lat = lats[0];
+			const double max_lat = lats[iterations - 1];
+			const double med_lat = lats[(iterations - 1) / 2];
+			const double d1_lat = lats[(iterations - 1) / 10];
+			const double d9_lat = lats[9 * (iterations - 1) / 10];
+			double avg_lat = 0.0;
+
+			for(uint64_t k = 0; k < iterations; k++)
+			{
+				avg_lat += lats[k];
+			}
+
+			avg_lat /= iterations;
+			const double bw_million_byte = s / min_lat;
+			const double bw_mbyte        = bw_million_byte / 1.048576;
+
+			printf("%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
+				(long long)s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
+			fflush(stdout);
+		}
+		starpu_data_unregister(handle_recv);
+		starpu_data_unregister(handle_send);
+
+		free(vector_send);
+		free(vector_recv);
+	}
+	global_tend = starpu_timing_now();
+
+	if (mpi_rank == 0)
+	{
+		printf("Comm bench took %9.3lf ms\n", (global_tend - global_tstart) / 1000);
+	}
+
+	free(lats);
+}

+ 21 - 0
mpi/tests/abstract_sendrecv_bench.h

@@ -0,0 +1,21 @@
+
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier);

+ 62 - 0
mpi/tests/bench_helper.c

@@ -0,0 +1,62 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "bench_helper.h"
+
+
+int comp_double(const void*_a, const void*_b)
+{
+	const double* a = _a;
+	const double* b = _b;
+
+	if(*a < *b)
+		return -1;
+	else if(*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+
+uint64_t bench_next_size(uint64_t len)
+{
+	uint64_t next = len * MULT_DEFAULT + INCR_DEFAULT;
+
+	if(next <= len)
+		next++;
+
+	return next;
+}
+
+
+uint64_t bench_nb_iterations(int iterations, uint64_t len)
+{
+	const uint64_t max_data = NX_MAX;
+
+	if(len <= 0)
+		len = 1;
+
+	uint64_t data_size = ((uint64_t)iterations * (uint64_t)len);
+
+	if(data_size  > max_data)
+	{
+		iterations = (max_data / (uint64_t)len);
+		if(iterations < 2)
+			iterations = 2;
+	}
+
+	return iterations;
+}

+ 37 - 0
mpi/tests/bench_helper.h

@@ -0,0 +1,37 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NX_MAX (512 * 1024 * 1024) // in bytes (512 MB)
+#define NX_MIN 0
+#ifdef STARPU_QUICK_CHECK
+#define MULT_DEFAULT 4
+#else
+#define MULT_DEFAULT 2
+#endif
+#define INCR_DEFAULT 0
+#ifdef STARPU_QUICK_CHECK
+#define LOOPS_DEFAULT 100
+#else
+#define LOOPS_DEFAULT 100000
+#endif
+
+int comp_double(const void*_a, const void*_b);
+uint64_t bench_next_size(uint64_t len);
+uint64_t bench_nb_iterations(int iterations, uint64_t len);

+ 6 - 168
mpi/tests/sendrecv_bench.c

@@ -18,84 +18,15 @@
  * Inspired a lot from NewMadeleine examples/benchmarks/nm_bench_sendrecv.c
  */
 
-#include <math.h>
 #include <starpu_mpi.h>
 #include "helper.h"
+#include "abstract_sendrecv_bench.h"
 
-#define NX_MAX (512 * 1024 * 1024) // kB
-#define NX_MIN 0
-#ifdef STARPU_QUICK_CHECK
-#define MULT_DEFAULT 4
-#else
-#define MULT_DEFAULT 2
-#endif
-#define INCR_DEFAULT 0
-#define NX_STEP 1.4 // multiplication
-#ifdef STARPU_QUICK_CHECK
-#define LOOPS_DEFAULT 100
-#else
-#define LOOPS_DEFAULT 10000
-#endif
-
-int times_nb_nodes;
-int times_size;
-int worldsize;
-
-static int comp_double(const void*_a, const void*_b)
-{
-	const double* a = _a;
-	const double* b = _b;
-
-	if(*a < *b)
-		return -1;
-	else if(*a > *b)
-		return 1;
-	else
-		return 0;
-}
-
-static inline uint64_t _next(uint64_t len, double multiplier, uint64_t increment)
-{
-	uint64_t next = len * multiplier + increment;
-
-	if(next <= len)
-		next++;
-
-	return next;
-}
-
-
-static inline uint64_t _iterations(int iterations, uint64_t len)
-{
-	const uint64_t max_data = 512 * 1024 * 1024;
-
-	if(len <= 0)
-		len = 1;
-
-	uint64_t data_size = ((uint64_t)iterations * (uint64_t)len);
-
-	if(data_size  > max_data)
-	{
-		iterations = (max_data / (uint64_t)len);
-		if(iterations < 2)
-			iterations = 2;
-	}
-
-	return iterations;
-}
 
 int main(int argc, char **argv)
 {
-	int ret, rank;
-	starpu_data_handle_t handle_send, handle_recv;
+	int ret, rank, worldsize;
 	int mpi_init;
-	float* vector_send = NULL;
-	float* vector_recv = NULL;
-	double t1, t2;
-	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
-	uint64_t iterations = LOOPS_DEFAULT;
-	double multiplier = MULT_DEFAULT;
-	uint64_t increment = INCR_DEFAULT;
 
 	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
@@ -115,108 +46,15 @@ int main(int argc, char **argv)
 		return STARPU_TEST_SKIPPED;
 	}
 
-	if (rank >= 2)
-	{
-		starpu_pause();
-		for (uint64_t s = NX_MIN; s <= NX_MAX; s = _next(s, multiplier, increment))
-		{
-			iterations = _iterations(iterations, s);
-
-			starpu_mpi_barrier(MPI_COMM_WORLD);
-
-			for (uint64_t j = 0; j < iterations; j++)
-			{
-				starpu_mpi_barrier(MPI_COMM_WORLD);
-			}
-		}
-		starpu_resume();
-
-		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
-		return 0;
-	}
-
-	if (rank == 0)
-	{
-		printf("Times in us\n");
-		printf("# size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
-	}
-
-	int array_size = 0;
-
-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = _next(s, multiplier, increment))
-	{
-		vector_send = malloc(s);
-		vector_recv = malloc(s);
-		memset(vector_send, 0, s);
-		memset(vector_recv, 0, s);
-
-		starpu_vector_data_register(&handle_send, STARPU_MAIN_RAM, (uintptr_t) vector_send, s, 1);
-		starpu_vector_data_register(&handle_recv, STARPU_MAIN_RAM, (uintptr_t) vector_recv, s, 1);
-
-		iterations = _iterations(iterations, s);
+	/* Pause workers for this bench: all workers polling for tasks has a strong impact on performances */
+	starpu_pause();
 
-		starpu_mpi_barrier(MPI_COMM_WORLD);
-
-		for (uint64_t j = 0; j < iterations; j++)
-		{
-			if (rank == 0)
-			{
-				t1 = starpu_timing_now();
-				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
-				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
-				t2 = starpu_timing_now();
-
-				const double delay = t2 - t1;
-				const double t = delay / 2;
-
-				lats[j] = t;
-			}
-			else
-			{
-				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
-				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
-			}
-
-			starpu_mpi_barrier(MPI_COMM_WORLD);
-		}
-
-		if (rank == 0)
-		{
-			qsort(lats, iterations, sizeof(double), &comp_double);
-
-			const double min_lat = lats[0];
-			const double max_lat = lats[iterations - 1];
-			const double med_lat = lats[(iterations - 1) / 2];
-			const double d1_lat = lats[(iterations - 1) / 10];
-			const double d9_lat = lats[9 * (iterations - 1) / 10];
-			double avg_lat = 0.0;
-
-			for(uint64_t k = 0; k < iterations; k++)
-			{
-				avg_lat += lats[k];
-			}
-
-			avg_lat /= iterations;
-			const double bw_million_byte = s / min_lat;
-			const double bw_mbyte        = bw_million_byte / 1.048576;
-
-			printf("%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
-				(long long)s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
-			fflush(stdout);
-		}
-		starpu_data_unregister(handle_recv);
-		starpu_data_unregister(handle_send);
-
-		free(vector_send);
-		free(vector_recv);
-	}
+	sendrecv_bench(rank, NULL);
 
+	starpu_resume();
 	starpu_mpi_shutdown();
 	if (!mpi_init)
 		MPI_Finalize();
 
-	free(lats);
 	return 0;
 }

+ 463 - 0
mpi/tests/sendrecv_gemm_bench.c

@@ -0,0 +1,463 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Simple *not distributed* parallel GEMM implementation and sendrecv bench at the same time.
+ *
+ * This bench is a merge of mpi/tests/sendrecv_bench and examples/mult/sgemm
+ *
+ * A *non-distributed* GEMM is computed on each node, while a sendrecv bench is running,
+ * completely independently. The goal is to measure the impact of worker computations on
+ * communications.
+ *
+ * Use the -nblocks parameter to define the matrix size (matrix size = nblocks * 320), such as
+ * the GEMM finishes after the sendrecv bench.
+ */
+#include <limits.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <starpu_mpi.h>
+#include <starpu_fxt.h>
+
+#include <common/blas.h>
+
+#include "helper.h"
+#include "abstract_sendrecv_bench.h"
+#include "../../examples/mult/simple.h"
+
+#define CHECK_TASK_SUBMIT(ret) do {				\
+	if (ret == -ENODEV)					\
+	{							\
+		ret = 77;					\
+		goto enodev;					\
+	}							\
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");	\
+} while(0)
+
+static int mpi_rank;
+static int comm_thread_cpuid = -1;
+static unsigned nslices = 4;
+#if defined(STARPU_QUICK_CHECK) && !defined(STARPU_SIMGRID)
+static unsigned matrix_dim = 256;
+#else
+static unsigned matrix_dim = 320 * 4;
+#endif
+static unsigned check = 0;
+
+static TYPE *A, *B, *C;
+static starpu_data_handle_t A_handle, B_handle, C_handle;
+
+static starpu_pthread_barrier_t thread_barrier;
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+#define PRINTF(fmt, ...) do { if (!getenv("STARPU_SSILENT")) {printf(fmt, ## __VA_ARGS__); fflush(stdout); }} while(0)
+
+static void check_output(void)
+{
+	/* compute C = C - AB */
+	CPU_GEMM("N", "N", matrix_dim, matrix_dim, matrix_dim, (TYPE)-1.0f, A, matrix_dim, B, matrix_dim, (TYPE)1.0f, C, matrix_dim);
+
+	/* make sure C = 0 */
+	TYPE err;
+	err = CPU_ASUM(matrix_dim*matrix_dim, C, 1);
+
+	if (err < matrix_dim*matrix_dim*0.001)
+	{
+		FPRINTF(stderr, "Results are OK\n");
+	}
+	else
+	{
+		int max;
+		max = CPU_IAMAX(matrix_dim*matrix_dim, C, 1);
+
+		FPRINTF(stderr, "There were errors ... err = %f\n", err);
+		FPRINTF(stderr, "Max error : %e\n", C[max]);
+	}
+}
+
+static void init_problem_data(void)
+{
+#ifndef STARPU_SIMGRID
+	unsigned i,j;
+#endif
+
+	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+
+#ifndef STARPU_SIMGRID
+	/* fill the matrices */
+	for (j=0; j < matrix_dim; j++)
+	{
+		for (i=0; i < matrix_dim; i++)
+		{
+			A[j+i*matrix_dim] = (TYPE)(starpu_drand48());
+			B[j+i*matrix_dim] = (TYPE)(starpu_drand48());
+			C[j+i*matrix_dim] = (TYPE)(0);
+		}
+	}
+#endif
+}
+
+static void partition_mult_data(void)
+{
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+
+	struct starpu_data_filter vert;
+	memset(&vert, 0, sizeof(vert));
+	vert.filter_func = starpu_matrix_filter_vertical_block;
+	vert.nchildren = nslices;
+
+	struct starpu_data_filter horiz;
+	memset(&horiz, 0, sizeof(horiz));
+	horiz.filter_func = starpu_matrix_filter_block;
+	horiz.nchildren = nslices;
+
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
+
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+}
+
+
+void cpu_init_matrix_random(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (starpu_drand48());
+		subB[i] = (TYPE) (starpu_drand48());
+	}
+}
+
+
+void cpu_init_matrix_zero(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (0);
+	}
+}
+
+
+void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	int worker_size = starpu_combined_worker_get_size();
+
+	if (worker_size == 1)
+	{
+		/* Sequential CPU task */
+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
+	}
+	else
+	{
+		/* Parallel CPU task */
+		unsigned rank = starpu_combined_worker_get_rank();
+
+		unsigned block_size = (nyC + worker_size - 1)/worker_size;
+		unsigned new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
+
+		STARPU_ASSERT(nyC == STARPU_MATRIX_GET_NY(descr[1]));
+
+		TYPE *new_subB = &subB[block_size*rank];
+		TYPE *new_subC = &subC[block_size*rank];
+
+		CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
+	}
+}
+
+static struct starpu_perfmodel starpu_gemm_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = STARPU_GEMM_STR(gemm)
+};
+
+static struct starpu_codelet cl =
+{
+	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_mult},
+	.cpu_funcs_name = {"cpu_mult"},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &starpu_gemm_model
+};
+
+static struct starpu_codelet cl_init_matrix_random =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_random},
+	.cpu_funcs_name = {"cpu_init_matrix_random"},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W}
+};
+
+static struct starpu_codelet cl_init_matrix_zero =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_zero},
+	.cpu_funcs_name = {"cpu_init_matrix_zero"},
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			char *argptr;
+			nslices = strtol(argv[++i], &argptr, 10);
+			matrix_dim = 320 * nslices;
+		}
+
+		else if (strcmp(argv[i], "-size") == 0)
+		{
+			char *argptr;
+			unsigned matrix_dim_tmp = strtol(argv[++i], &argptr, 10);
+			if (matrix_dim_tmp % 320 != 0)
+			{
+				fprintf(stderr, "Matrix size has to be a multiple of 320\n");
+			}
+			else
+			{
+				matrix_dim = matrix_dim_tmp;
+				nslices = matrix_dim / 320;
+			}
+		}
+
+		else if (strcmp(argv[i], "-check") == 0)
+		{
+			check = 1;
+		}
+
+		else if (strcmp(argv[i], "-spmd") == 0)
+		{
+			cl.type = STARPU_SPMD;
+		}
+
+		else if (strcmp(argv[i], "-comm-thread-cpuid") == 0)
+		{
+			comm_thread_cpuid = atoi(argv[++i]);
+		}
+
+		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
+		{
+			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-spmd] [-comm-thread-cpuid cpuid]\n", argv[0]);
+			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks\n", matrix_dim, nslices);
+			fprintf(stderr, "Use -comm-thread-cpuid to specify where to bind the comm benchmarking thread\n");
+			exit(EXIT_SUCCESS);
+		}
+		else
+		{
+			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+
+static void* comm_thread_func(void* arg)
+{
+	if (comm_thread_cpuid < 0)
+	{
+		comm_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
+	}
+
+	if (starpu_bind_thread_on(comm_thread_cpuid, 0, "Comm") < 0)
+	{
+		char hostname[65];
+		gethostname(hostname, sizeof(hostname));
+		_STARPU_DISP("[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
+	}
+
+	sendrecv_bench(mpi_rank, &thread_barrier);
+
+	return NULL;
+}
+
+
+int main(int argc, char **argv)
+{
+	double start, end;
+	int ret, mpi_init, worldsize;
+	starpu_pthread_t comm_thread;
+
+	char hostname[255];
+	gethostname(hostname, 255);
+
+	parse_args(argc, argv);
+
+	starpu_fxt_autostart_profiling(0);
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	if (worldsize < 2)
+	{
+		if (mpi_rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+
+	STARPU_PTHREAD_BARRIER_INIT(&thread_barrier, NULL, 2);
+
+
+	// Start comm thread, benchmarking sendrecv:
+	STARPU_PTHREAD_CREATE(&comm_thread, NULL, comm_thread_func, NULL);
+
+
+	// Main thread will submit GEMM tasks:
+	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	partition_mult_data();
+
+
+	if (mpi_rank == 0)
+	{
+		PRINTF("# node\tx\ty\tz\tms\tGFlops\n");
+	}
+
+	starpu_pause();
+
+	unsigned x, y;
+#ifndef STARPU_SIMGRID
+	// Initialize matrices:
+	for (x = 0; x < nslices; x++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl_init_matrix_random;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, x);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+
+		for (y = 0; y < nslices; y++)
+		{
+			task = starpu_task_create();
+			task->cl = &cl_init_matrix_zero;
+			task->handles[0] = starpu_data_get_sub_data(C_handle, 2, x, y);
+			ret = starpu_task_submit(task);
+			CHECK_TASK_SUBMIT(ret);
+		}
+	}
+#endif
+
+	for (x = 0; x < nslices; x++)
+	for (y = 0; y < nslices; y++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
+		task->flops = 2ULL * (matrix_dim/nslices) * (matrix_dim/nslices) * matrix_dim;
+
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+		starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	starpu_fxt_start_profiling();
+
+	STARPU_PTHREAD_BARRIER_WAIT(&thread_barrier);
+
+	start = starpu_timing_now();
+	starpu_resume();
+	starpu_task_wait_for_all();
+	end = starpu_timing_now();
+	starpu_pause(); // Pause not to disturb comm thread if it isn't done
+
+	double timing = end - start;
+	double flops = 2.0*((unsigned long long)matrix_dim) * ((unsigned long long)matrix_dim)*((unsigned long long)matrix_dim);
+
+	PRINTF("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
+
+
+enodev:
+	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+
+	if (check)
+		check_output();
+
+	starpu_free_flags(A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+
+
+	// Wait comm thread:
+	STARPU_PTHREAD_JOIN(comm_thread, NULL);
+	STARPU_PTHREAD_BARRIER_DESTROY(&thread_barrier);
+
+	starpu_fxt_stop_profiling();
+
+	starpu_resume();
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return ret;
+}

+ 215 - 0
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -0,0 +1,215 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * sendrecv benchmark from different tasks, executed simultaneously on several
+ * workers.
+ * Inspired a lot from NewMadeleine examples/piom/nm_piom_pingpong.c
+ *
+ * The goal is to measure impact of calls to starpu_mpi_* from different threads.
+ *
+ * Use STARPU_NCPU to set the number of parallel ping pongs
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+#include "bench_helper.h"
+#include "abstract_sendrecv_bench.h"
+
+#define NB_WARMUP_PINGPONGS 10
+
+/* We reduce NX_MAX, since some NICs don't support exchanging simultaneously such amount of memory */
+#undef NX_MAX
+#define NX_MAX (64 * 1024 * 1024)
+
+
+void cpu_task(void* descr[], void* args)
+{
+	int mpi_rank;
+	uint64_t iterations = LOOPS_DEFAULT / 100;
+	uint64_t s;
+	starpu_data_handle_t handle_send, handle_recv;
+	double t1, t2;
+	int asked_worker;
+	int current_worker = starpu_worker_get_id();
+
+	starpu_codelet_unpack_args(args, &mpi_rank, &asked_worker, &s, &handle_send, &handle_recv);
+
+	STARPU_ASSERT(asked_worker == current_worker);
+
+	iterations = bench_nb_iterations(iterations, s);
+	double* lats = malloc(sizeof(double) * iterations);
+
+	for (uint64_t j = 0; j < NB_WARMUP_PINGPONGS; j++)
+	{
+		if (mpi_rank == 0)
+		{
+			starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+			starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+		}
+		else
+		{
+			starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+			starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+		}
+	}
+
+	for (uint64_t j = 0; j < iterations; j++)
+	{
+		if (mpi_rank == 0)
+		{
+			t1 = starpu_timing_now();
+			starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+			starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+			t2 = starpu_timing_now();
+
+			lats[j] =  (t2 - t1) / 2;
+		}
+		else
+		{
+			starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+			starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+		}
+	}
+
+	if (mpi_rank == 0)
+	{
+		qsort(lats, iterations, sizeof(double), &comp_double);
+
+		const double min_lat = lats[0];
+		const double max_lat = lats[iterations - 1];
+		const double med_lat = lats[(iterations - 1) / 2];
+		const double d1_lat = lats[(iterations - 1) / 10];
+		const double d9_lat = lats[9 * (iterations - 1) / 10];
+		double avg_lat = 0.0;
+
+		for(uint64_t k = 0; k < iterations; k++)
+		{
+			avg_lat += lats[k];
+		}
+
+		avg_lat /= iterations;
+		const double bw_million_byte = s / min_lat;
+		const double bw_mbyte        = bw_million_byte / 1.048576;
+
+		printf("%2d\t\t%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
+			current_worker, (long long) s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
+		fflush(stdout);
+	}
+}
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = { cpu_task },
+	.cpu_funcs_name = { "cpu_task" },
+	.nbuffers = 0
+};
+
+int main(int argc, char **argv)
+{
+	int ret, rank, worldsize;
+	int mpi_init;
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	if (worldsize < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	if (rank == 0)
+	{
+		printf("Times in us\n");
+		printf("# worker | size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
+	}
+	else if (rank >= 2)
+	{
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return 0;
+	}
+
+
+	unsigned cpu_count = starpu_cpu_worker_get_count();
+	unsigned* mpi_tags = malloc(cpu_count * sizeof(unsigned));
+	unsigned tag = 0;
+
+	int* workers = malloc(cpu_count * sizeof(int));
+	float** vectors_send = malloc(cpu_count * sizeof(float*));
+	float** vectors_recv = malloc(cpu_count * sizeof(float*));
+	starpu_data_handle_t* handles_send = malloc(cpu_count * sizeof(starpu_data_handle_t));
+	starpu_data_handle_t* handles_recv = malloc(cpu_count * sizeof(starpu_data_handle_t));
+
+	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	{
+		starpu_pause();
+
+		for (int i = 0; i < cpu_count; i++)
+		{
+			workers[i] = i;
+			vectors_send[i] = malloc(s);
+			vectors_recv[i] = malloc(s);
+			memset(vectors_send[i], 0, s);
+			memset(vectors_recv[i], 0, s);
+
+			starpu_vector_data_register(&handles_send[i], STARPU_MAIN_RAM, (uintptr_t) vectors_send[i], s, 1);
+			starpu_vector_data_register(&handles_recv[i], STARPU_MAIN_RAM, (uintptr_t) vectors_recv[i], s, 1);
+
+			starpu_task_insert(&cl,
+					STARPU_EXECUTE_ON_WORKER, workers[i],
+					STARPU_VALUE, &rank, sizeof(int),
+					STARPU_VALUE, workers + i, sizeof(int),
+					STARPU_VALUE, &s, sizeof(uint64_t),
+					STARPU_VALUE, &handles_send[i], sizeof(starpu_data_handle_t),
+					STARPU_VALUE, &handles_recv[i], sizeof(starpu_data_handle_t), 0);
+		}
+
+		starpu_resume();
+		starpu_task_wait_for_all();
+
+		for (unsigned i = 0; i < cpu_count; i++)
+		{
+			starpu_data_unregister(handles_send[i]);
+			starpu_data_unregister(handles_recv[i]);
+			free(vectors_send[i]);
+			free(vectors_recv[i]);
+		}
+	}
+
+	free(workers);
+	free(vectors_send);
+	free(vectors_recv);
+	free(handles_send);
+	free(handles_recv);
+	free(mpi_tags);
+
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return 0;
+}

+ 46 - 0
src/core/perfmodel/perfmodel.c

@@ -81,6 +81,20 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsi
 }
 
 /*
+ * PER WORKER model
+ */
+
+static double per_worker_task_expected_perf(struct starpu_perfmodel *model, unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	double (*worker_cost_function)(struct starpu_task *task, unsigned workerid, unsigned nimpl);
+
+	worker_cost_function = model->worker_cost_function;
+	STARPU_ASSERT_MSG(worker_cost_function, "STARPU_PER_WORKER needs worker_cost_function to be defined");
+
+	return worker_cost_function(task, workerid, nimpl);
+}
+
+/*
  * PER ARCH model
  */
 
@@ -156,6 +170,7 @@ void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model)
 
 	switch (model->type)
 	{
+		case STARPU_PER_WORKER:
 		case STARPU_PER_ARCH:
 		case STARPU_COMMON:
 			/* Nothing more to do than init */
@@ -220,6 +235,20 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 	return exp_perf;
 }
 
+static double starpu_model_worker_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!model)
+		return 0.0;
+
+	if (model->type == STARPU_PER_WORKER)
+		return per_worker_task_expected_perf(model, workerid, task, nimpl);
+	else
+	{
+		struct starpu_perfmodel_arch *per_arch = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
+		return starpu_model_expected_perf(task, model, per_arch, nimpl);
+	}
+}
+
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	if (!task->cl)
@@ -228,6 +257,14 @@ double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfm
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 }
 
+double starpu_task_worker_expected_length(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!task->cl)
+		/* Tasks without codelet don't actually take time */
+		return 0.0;
+	return starpu_model_worker_expected_perf(task, task->cl->model, workerid, sched_ctx_id, nimpl);
+}
+
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	if (!task->cl)
@@ -236,6 +273,15 @@ double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfm
 	return starpu_model_expected_perf(task, task->cl->energy_model, arch, nimpl);
 }
 
+double starpu_task_worker_expected_energy(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!task->cl)
+		/* Tasks without codelet don't actually consume energy */
+		return 0.0;
+	return starpu_model_worker_expected_perf(task, task->cl->energy_model, workerid, sched_ctx_id, nimpl);
+
+}
+
 double starpu_task_expected_conversion_time(struct starpu_task *task,
 					    struct starpu_perfmodel_arch* arch,
 					    unsigned nimpl)

+ 5 - 2
src/sched_policies/component_sched.c

@@ -49,7 +49,6 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 	    workerid != -1;
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	{
-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
@@ -59,9 +58,13 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 				double d;
 				can_execute = 1;
 				if(bundle)
+				{
+					struct starpu_perfmodel_arch* archtype =
+						starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 					d = starpu_task_bundle_expected_length(bundle, archtype, nimpl);
+				}
 				else
-					d = starpu_task_expected_length(task, archtype, nimpl);
+					d = starpu_task_worker_expected_length(task, workerid, component->tree->sched_ctx_id, nimpl);
 				if(isnan(d))
 				{
 					*length = d;

+ 5 - 6
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -487,7 +487,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 			}
 
 			double exp_end;
-			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
+			double local_length = starpu_task_worker_expected_length(task, worker, sched_ctx_id, nimpl);
 			double local_penalty = starpu_task_expected_data_transfer_time_for(task, worker);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
@@ -679,9 +679,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			}
 			else
 			{
-				local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_task_length[worker_ctx][nimpl] = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, nimpl);
 				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
-				local_energy[worker_ctx][nimpl] = starpu_task_expected_energy(task, perf_arch,nimpl);
+				local_energy[worker_ctx][nimpl] = starpu_task_worker_expected_energy(task, workerid, sched_ctx_id,nimpl);
 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
 				if (conversion_time > 0.0)
 					local_task_length[worker_ctx][nimpl] += conversion_time;
@@ -1100,10 +1100,9 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
-	/* Compute the expected penality */
-	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid, sched_ctx_id);
 
-	double predicted = starpu_task_expected_length(task, perf_arch,
+	/* Compute the expected penalty */
+	double predicted = starpu_task_worker_expected_length(task, perf_workerid, STARPU_NMAX_SCHED_CTXS,
 						       starpu_task_get_implementation(task));
 
 	double predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);

+ 11 - 1
tools/starpu_replay.c

@@ -1085,13 +1085,23 @@ int main(int argc, char **argv)
 		}
 		else if (TEST("Sizes"))
 		{
+			*ln = 0;
 			char *  buffer = s + 7;
 			const char * delim = " ";
-			char * token = strtok(buffer, delim);
+			unsigned nb_parameters_line = count_number_tokens(buffer, delim);
 			unsigned k = 0;
 
+			if(nb_parameters == 0)
+			{
+				nb_parameters = nb_parameters_line;
+				arrays_managing(set_alloc_mode(nb_parameters));
+			}
+			else
+				STARPU_ASSERT(nb_parameters == nb_parameters_line);
+
 			_STARPU_MALLOC(sizes_set, nb_parameters * sizeof(size_t));
 
+			char * token = strtok(buffer, delim);
 			while (token != NULL && k < nb_parameters)
 			{
 				sizes_set[k] = strtol(token, NULL, 10);