Browse Source

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu

Denis Barthou 5 years ago
parent
commit
ce489e4de4

+ 2 - 0
.gitignore

@@ -7,6 +7,8 @@
 /build
 /build
 /build2
 /build2
 /build-aux
 /build-aux
+/build_starpu
+/install
 /GPATH
 /GPATH
 /GRTAGS
 /GRTAGS
 /GTAGS
 /GTAGS

+ 1 - 0
ChangeLog

@@ -29,6 +29,7 @@ New features:
   * New number_events.data trace file which monitors number of events in trace
   * New number_events.data trace file which monitors number of events in trace
     files. This file can be parsed by the new script
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
+  * New STARPU_PER_WORKER perfmodel.
 
 
 Small changes:
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
   * Use the S4U interface of Simgrid instead of xbt and MSG.

+ 167 - 249
configure.ac

@@ -92,6 +92,7 @@ if test x$enable_perf_debug = xyes; then
     enable_shared=no
     enable_shared=no
 fi
 fi
 default_enable_mpi_check=maybe
 default_enable_mpi_check=maybe
+default_enable_mpi=maybe
 
 
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
@@ -206,6 +207,9 @@ if test x$enable_simgrid = xyes ; then
         # want that by default
         # want that by default
 	default_enable_mpi_check=no
 	default_enable_mpi_check=no
 
 
+	# disable MPI support by default
+	default_enable_mpi=no
+
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	AC_LANG_PUSH([C++])
 	AC_LANG_PUSH([C++])
 	if test x$enable_shared = xno ; then
 	if test x$enable_shared = xno ; then
@@ -270,145 +274,146 @@ fi
 
 
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
-#                                    MPI                                      #
+#                                LIBTOOLS                                     #
 #                                                                             #
 #                                                                             #
 ###############################################################################
 ###############################################################################
 
 
-AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
-                              [Disable StarPU MPI library generation])],
-            [enable_mpi=$enableval],
-            [enable_mpi=yes])
+#c++11 detection
+AX_CXX_COMPILE_STDCXX(11,noext,optional)
 
 
-AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
-                              [Enable StarPU to run with the master-slave mode])],
-            use_mpi_master_slave=$enableval,
-            use_mpi_master_slave=no)
+AC_SUBST([STARPU_HAVE_CXX11], $HAVE_CXX11)
+AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
+if test $HAVE_CXX11 -eq 1; then
+  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
+fi
 
 
-#Check MPICC
-AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
-           [Path of the mpicc compiler])],
-   [
-       if test x$withval = xyes; then
-           AC_MSG_ERROR(--with-mpicc must be given a pathname)
-       elif test x$withval = xno ; then
-           mpi_requested=no
-	   mpicc_path=""
-	   use_mpi=no
-       else
-	   mpi_requested=yes
-           mpicc_path=$withval
-       fi
-   ],
-   [
-       mpi_requested=maybe
-       if test x$enable_simgrid = xyes ; then
-           DEFAULT_MPICC=smpicc
-       else
-           DEFAULT_MPICC=mpicc
-       fi
-       # nothing was specified: default value is used
-       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
-   ])
+LT_PREREQ([2.2])
+LT_INIT([win32-dll])
 
 
-# in case MPI was explicitely required, but is not available, this is an error
-if test x$mpi_requested = xyes -a ! -x "$mpicc_path"; then
-   AC_MSG_ERROR([Compiler MPI not valid])
-fi
+AC_HEADER_STDC
 
 
-if test x$mpi_requested != xno ; then
-   # We test if the MPICC compiler exists
-     if test ! -x $mpicc_path; then
-         #MPICC does not exists or is not executable
-	 AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
-	 use_mpi=no
-     else
-	 use_mpi=yes
-	 if test x$enable_simgrid = xyes ; then
-             AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
-						  [Path of the smpirun helper])],
-			 [
-			     if test x$withval = xyes; then
-				 AC_MSG_ERROR(--with-smpirun must be given a pathname)
-			     else
-				 smpirun_path=$withval
-			     fi
-			 ],
-			 [
-			     # nothing was specified: default value is used
-			     AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
-			 ])
-	 fi
-     fi
+AC_C_RESTRICT
+
+# Check if bash is available
+AC_PATH_PROG([REALBASH], [bash], , [/bin:$PATH])
+
+# Record git version
+AC_PATH_PROG(gitcommand, git)
+if test "$gitcommand" = "" ; then
+   if test -f $srcdir/STARPU-REVISION ; then
+      cp $srcdir/STARPU-REVISION .
+   else
+      echo "unknown" > ./STARPU-REVISION
+   fi
+else
+   git log -n 1 --pretty="%H" $srcdir > ./STARPU-REVISION
 fi
 fi
 
 
-AC_MSG_CHECKING(mpicc path)
+AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
+
+###############################################################################
+#                                                                             #
+#                           MPI compilers                                     #
+#                                                                             #
+###############################################################################
+
+#Check MPICC
+if test x$enable_simgrid = xyes ; then
+    DEFAULT_MPICC=smpicc
+else
+    DEFAULT_MPICC=mpicc
+fi
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc=<path to mpicc>], [Path of the mpicc compiler])], [DEFAULT_MPICC=$withval])
+AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+AC_MSG_CHECKING(whether mpicc is available)
 AC_MSG_RESULT($mpicc_path)
 AC_MSG_RESULT($mpicc_path)
 AC_SUBST(MPICC, $mpicc_path)
 AC_SUBST(MPICC, $mpicc_path)
 
 
-
 #Check MPICXX/MPIC++
 #Check MPICXX/MPIC++
-AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
-           [Path of the mpicxx/mpic++ compiler])],
-   [
-       if test x$withval = xyes; then
-           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
-       else
-           mpicxx_path=$withval
-       fi
-   ],
-   [
-       if test x$enable_simgrid = xyes ; then
-           DEFAULT_MPICXX=smpicxx
-       else
-           DEFAULT_MPICXX=mpicxx
-       fi
-       # nothing was specified: default value is used
-       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+if test x$enable_simgrid = xyes ; then
+    DEFAULT_MPICXX=smpicxx
+else
+    DEFAULT_MPICXX=mpicxx
+fi
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx=<path to mpicxx>], [Path of the mpicxx/mpic++ compiler])], [DEFAULT_MPICXX=$withval])
+AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
 
 
-       # try with mpic++ if mpicxx was not found
-       if test x$mpicxx_path = xno ; then
-            DEFAULT_MPICXX=mpic++
-            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
-       fi
-   ])
+# try with mpic++ if mpicxx was not found
+if test x$mpicxx_path = xno ; then
+    DEFAULT_MPICXX=mpic++
+    AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+fi
 
 
 # We test if the MPICXX/MPIC++ compiler exists
 # We test if the MPICXX/MPIC++ compiler exists
 if test ! -x $mpicxx_path; then
 if test ! -x $mpicxx_path; then
-    #MPICXX/MPIC++ does not exists or is not executable
     AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
     AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
-    use_mpicxx=no
-else
-    use_mpicxx=yes
+    mpicxx_path=no
 fi
 fi
 
 
-AC_MSG_CHECKING(mpicxx/mpic++ path)
+AC_MSG_CHECKING(whether mpicxx is available)
 AC_MSG_RESULT($mpicxx_path)
 AC_MSG_RESULT($mpicxx_path)
 AC_SUBST(MPICXX, $mpicxx_path)
 AC_SUBST(MPICXX, $mpicxx_path)
 
 
+###############################################################################
+#                                                                             #
+#                                    MPI                                      #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
+                              [Disable StarPU MPI library generation])],
+            [enable_mpi=$enableval],
+            [enable_mpi=$default_enable_mpi])
 
 
-if test x$use_mpi = xyes -a \( x$enable_mpi = xyes -o x$use_mpi_master_slave = xyes \) ; then
-    cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
-else
-    cc_or_mpicc=$CC
+if test x$enable_mpi = xmaybe ; then
+    if test -x "$mpicc_path"; then
+	enable_mpi=yes
+    else
+	enable_mpi=no
+    fi
 fi
 fi
 
 
-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+# in case MPI was explicitly required, but mpicc is not available, this is an error
+if test x$enable_mpi = xyes -a ! -x "$mpicc_path"; then
+   AC_MSG_ERROR([Compiler MPI '$mpicc_path' not valid])
+fi
 
 
-AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
-				   [Prevent StarPU MPI from reading buffers while being sent over MPI])],
-				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
-if test x$enable_mpi_pedantic_isend = xyes; then
-	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
+build_mpi_lib=$enable_mpi
+
+###############################################################################
+#                                                                             #
+#                                NEW MADELEINE                                #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(nmad, [AS_HELP_STRING([--enable-nmad],
+		                    [Enable StarPU MPI library generation using the new madeleine backend])],
+            [enable_nmad=$enableval],
+            [enable_nmad=no])
+
+build_nmad_lib=no
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+#We can only build StarPU MPI Library if User wants it and MPI is available
+if test x$enable_mpi = xyes -a x$enable_nmad = xyes ; then
+    build_nmad_lib=yes
+    build_mpi_lib=no
+    PKG_CHECK_MODULES([NMAD],[nmad])
+else
+    build_nmad_lib=no
 fi
 fi
 
 
-#We can only build MPI Master Slave if User wants it and MPI is available
-if test x$use_mpi_master_slave = xyes -a x$use_mpi = xyes -a x$use_mpicxx = xyes; then
+###############################################################################
+#                                                                             #
+#                             MPI Master Slave                                #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
+                              [Enable StarPU to run with the master-slave mode])],
+              use_mpi_master_slave=$enableval,
+              use_mpi_master_slave=no)
+#We can only build MPI Master Slave if User wants it and MPI compiler are available
+if test x$use_mpi_master_slave = xyes -a x$mpicc_path != xno -a x${mpicxx_path} != xno ; then
     build_mpi_master_slave=yes
     build_mpi_master_slave=yes
 else
 else
     build_mpi_master_slave=no
     build_mpi_master_slave=no
@@ -417,7 +422,9 @@ fi
 #users cannot use both at the same time
 #users cannot use both at the same time
 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
     AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
     AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
-	enable_mpi=no
+    build_mpi_lib=no
+    build_nmad_lib=no
+    enable_mpi=no
 fi
 fi
 
 
 if test x$build_mpi_master_slave = xyes; then
 if test x$build_mpi_master_slave = xyes; then
@@ -449,95 +456,19 @@ AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
 AC_MSG_RESULT($nmaxmpidev)
 AC_MSG_RESULT($nmaxmpidev)
 AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
 AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
 
 
-
-###############################################################################
-#                                                                             #
-#                                NEW MADELEINE                                #
-#                                                                             #
-###############################################################################
-
-AC_ARG_ENABLE(nmad, [AS_HELP_STRING([--enable-nmad],
-                              [Enable StarPU MPI library generation using new madeleine instead of mpi])],
-            [enable_nmad=$enableval],
-            [enable_nmad=no])
-
-if test x$use_mpi = xyes -a \( x$enable_nmad \) ; then
-    cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
-else
-    cc_or_mpicc=$CC
-fi
-
-build_nmad_lib=no
-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
-#We can only build StarPU MPI Library if User wants it and MPI is available
-if test x$use_mpi = xyes -a x$enable_nmad = xyes ; then
-    build_nmad_lib=yes
-    enable_mpi=no
-    PKG_CHECK_MODULES([NMAD],[nmad])
-else
-    build_nmad_lib=no
-fi
-
-# in case NMAD was explicitely required, but the compiler MPI, this is an error
-if test x$enable_nmad = xyes -a ! -x "$mpicc_path"; then
-   AC_MSG_ERROR([Compiler MPI not valid])
-fi
-
-
-AC_MSG_CHECKING(whether the StarPU MPI nmad library should be generated)
-AC_MSG_RESULT($build_nmad_lib)
-
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
-#                                LIBTOOLS                                     #
+#                       Miscellaneous things for MPI                          #
 #                                                                             #
 #                                                                             #
 ###############################################################################
 ###############################################################################
 
 
-#c++11 detection
-AX_CXX_COMPILE_STDCXX(11,noext,optional)
-
-AC_SUBST([STARPU_HAVE_CXX11], $HAVE_CXX11)
-AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
-if test $HAVE_CXX11 -eq 1; then
-  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
-fi
-
-LT_PREREQ([2.2])
-LT_INIT([win32-dll])
-
-AC_HEADER_STDC
-
-AC_C_RESTRICT
-
-# Check if bash is available
-AC_PATH_PROG([REALBASH], [bash], , [/bin:$PATH])
-
-# Record git version
-AC_PATH_PROG(gitcommand, git)
-if test "$gitcommand" = "" ; then
-   if test -f $srcdir/STARPU-REVISION ; then
-      cp $srcdir/STARPU-REVISION .
-   else
-      echo "unknown" > ./STARPU-REVISION
-   fi
-else
-   git log -n 1 --pretty="%H" $srcdir > ./STARPU-REVISION
+AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
+				   [Prevent StarPU MPI from reading buffers while being sent over MPI])],
+				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
+if test x$enable_mpi_pedantic_isend = xyes; then
+	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
 fi
 fi
 
 
-AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
-
-###############################################################################
-#                                                                             #
-#                       Miscellaneous things for MPI                          #
-#                                                                             #
-###############################################################################
-
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]),
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]),
 	      [enable_mpi_check=$enableval], [enable_mpi_check=$default_enable_mpi_check])
 	      [enable_mpi_check=$enableval], [enable_mpi_check=$default_enable_mpi_check])
@@ -551,68 +482,45 @@ fi
 if test x$enable_mpi_check = xno ; then
 if test x$enable_mpi_check = xno ; then
     running_mpi_check=no
     running_mpi_check=no
 fi
 fi
+if test x$enable_mpi = xno ; then
+    running_mpi_check=no
+fi
 
 
-
-if test x$enable_simgrid = xno ; then
+if test x$enable_mpi = xyes -a x$running_mpi_check = xyes ; then
     # Check if mpiexec is available
     # Check if mpiexec is available
-    AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
-                [Path of mpiexec])],
-        [
-            if test x$withval = xyes; then
-                AC_MSG_ERROR(--with-mpiexec must be given a pathname)
-            else
-                mpiexec_path=$withval
-            fi
-        ],
-        [
-            # nothing was specified: look in the path
-	    if test x$mpicc_path = x ; then
-		AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$PATH])
-	    else
-		AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
-	    fi
-        ])
-
+    if test x$enable_simgrid = xyes ; then
+	DEFAULT_MPIEXEC=smpirun
+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]], [Path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
+	AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$simgrid_dir/bin:$PATH])
+    else
+	DEFAULT_MPIEXEC=mpiexec
+	AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<path to mpiexec>], [Path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
+	if test x$mpicc_path = x ; then
+	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$PATH])
+	else
+	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$(dirname $mpicc_path):$PATH])
+	fi
+    fi
     AC_MSG_CHECKING(whether mpiexec is available)
     AC_MSG_CHECKING(whether mpiexec is available)
     AC_MSG_RESULT($mpiexec_path)
     AC_MSG_RESULT($mpiexec_path)
 
 
     # We test if MPIEXEC exists
     # We test if MPIEXEC exists
     if test ! -x $mpiexec_path; then
     if test ! -x $mpiexec_path; then
-        # if it's not valid, it could be the parameter given to configure.ac was not a full path, let's look for it
-	if test x$mpicc_path = x ; then
-            AC_PATH_PROG(mpiexec_path_bis, $mpiexec_path, [no], [$PATH])
-	else
-            AC_PATH_PROG(mpiexec_path_bis, $mpiexec_path, [no], [$(dirname $mpicc_path):$PATH])
-	fi
-        AC_MSG_CHECKING(whether mpiexec is available (2nd try))
-        AC_MSG_RESULT($mpiexec_path_bis)
-	if test -x $mpiexec_path_bis; then
-	   mpiexec_path=$mpiexec_path_bis
-	else
-           #MPIEXEC does not exists or is not executable
-           AC_MSG_RESULT(The mpiexec script is not valid)
-           running_mpi_check=no
-           mpiexec_path=""
-	fi
+        AC_MSG_RESULT(The mpiexec script '$mpiexec_path' is not valid)
+        running_mpi_check=no
+        mpiexec_path=""
     fi
     fi
     AC_SUBST(MPIEXEC,$mpiexec_path)
     AC_SUBST(MPIEXEC,$mpiexec_path)
 fi
 fi
 
 
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
-if test x$use_mpi = xyes ; then
-    AC_MSG_CHECKING(whether MPI tests should be run)
-    AC_MSG_RESULT($running_mpi_check)
-fi
-
-#We can only build StarPU MPI Library if User wants it and MPI is available
-if test x$use_mpi = xyes -a x$enable_mpi = xyes ; then
-    build_mpi_lib=yes
-else
-    build_mpi_lib=no
-fi
+AC_MSG_CHECKING(whether MPI tests should be run)
+AC_MSG_RESULT($running_mpi_check)
 
 
 AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
 AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
 AC_MSG_RESULT($build_mpi_lib)
 AC_MSG_RESULT($build_mpi_lib)
+AC_MSG_CHECKING(whether the StarPU MPI nmad library should be generated)
+AC_MSG_RESULT($build_nmad_lib)
 
 
 AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes)
 AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes)
 if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
 if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
@@ -622,11 +530,9 @@ if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
 	else
 	else
 		AC_DEFINE(STARPU_USE_MPI_NMAD,[1],[whether the StarPU MPI library (with a NewMadeleine implementation) is available])
 		AC_DEFINE(STARPU_USE_MPI_NMAD,[1],[whether the StarPU MPI library (with a NewMadeleine implementation) is available])
 	fi
 	fi
-else
-	running_mpi_check=no
 fi
 fi
 
 
-if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
+if test x$enable_mpi = xyes ; then
     if test x$enable_simgrid = xyes ; then
     if test x$enable_simgrid = xyes ; then
         if test x$enable_shared = xyes ; then
         if test x$enable_shared = xyes ; then
 	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, if you need the MPI support, then use --disable-shared to fix this, else disable MPI with --disable-mpi])
 	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, if you need the MPI support, then use --disable-shared to fix this, else disable MPI with --disable-mpi])
@@ -644,17 +550,16 @@ AM_CONDITIONAL(STARPU_USE_MPI_NMAD, test x$build_nmad_lib = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI, test x$build_nmad_lib = xyes -o x$build_mpi_lib = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI, test x$build_nmad_lib = xyes -o x$build_mpi_lib = xyes)
 
 
 AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
 AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
-			[Arguments for mpiexec])],
-	[
+					  [Arguments for mpiexec])],
+	    [
 		mpiexec_args=$withval
 		mpiexec_args=$withval
-	])
+	    ])
 AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
 AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
 
 
-
 AC_MSG_CHECKING(whether MPI debug messages should be displayed)
 AC_MSG_CHECKING(whether MPI debug messages should be displayed)
 AC_ARG_ENABLE(mpi-verbose, [AS_HELP_STRING([--enable-mpi-verbose],
 AC_ARG_ENABLE(mpi-verbose, [AS_HELP_STRING([--enable-mpi-verbose],
-			[display MPI verbose debug messages (--enable-mpi-verbose=extra increase the verbosity)])],
-			enable_mpi_verbose=$enableval, enable_mpi_verbose=no)
+					   [display MPI verbose debug messages (--enable-mpi-verbose=extra increase the verbosity)])],
+	      enable_mpi_verbose=$enableval, enable_mpi_verbose=no)
 AC_MSG_RESULT($enable_mpi_verbose)
 AC_MSG_RESULT($enable_mpi_verbose)
 if test x$enable_mpi_verbose = xyes; then
 if test x$enable_mpi_verbose = xyes; then
 	AC_DEFINE(STARPU_MPI_VERBOSE, [1], [display MPI verbose debug messages])
 	AC_DEFINE(STARPU_MPI_VERBOSE, [1], [display MPI verbose debug messages])
@@ -664,6 +569,19 @@ if test x$enable_mpi_verbose = xextra; then
 	AC_DEFINE(STARPU_MPI_EXTRA_VERBOSE, [1], [display MPI verbose debug messages])
 	AC_DEFINE(STARPU_MPI_EXTRA_VERBOSE, [1], [display MPI verbose debug messages])
 fi
 fi
 
 
+if test x$enable_mpi = xyes -o x$build_mpi_master_slave = xyes ; then
+    cc_or_mpicc=$mpicc_path
+    # For some reason, libtool uses gcc instead of mpicc when linking
+    # libstarpumpi.
+    # On Darwin (and maybe other systems ?) the linker will fail (undefined
+    # references to MPI_*). We manually add the required flags to fix this
+    # issue.
+    AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
+else
+    cc_or_mpicc=$CC
+fi
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
 #                           MIC device compilation                            #
 #                           MIC device compilation                            #

+ 1 - 1
doc/doxygen/chapters/310_data_management.doxy

@@ -602,7 +602,7 @@ whole machine, it would not be efficient to accumulate them in only one place,
 incurring data transmission each and access concurrency.
 incurring data transmission each and access concurrency.
 
 
 StarPU provides a mode ::STARPU_REDUX, which permits to optimize
 StarPU provides a mode ::STARPU_REDUX, which permits to optimize
-this case: it will allocate a buffer on each memory node, and accumulate
+this case: it will allocate a buffer on each worker (lazily), and accumulate
 intermediate results there. When the data is eventually accessed in the normal
 intermediate results there. When the data is eventually accessed in the normal
 mode ::STARPU_R, StarPU will collect the intermediate results in just one
 mode ::STARPU_R, StarPU will collect the intermediate results in just one
 buffer.
 buffer.

+ 4 - 3
doc/doxygen/chapters/320_scheduling.doxy

@@ -190,9 +190,10 @@ single task gives the consumption of the task in Joules, which can be given to
 starpu_perfmodel_update_history().
 starpu_perfmodel_update_history().
 
 
 Another way to provide the energy performance is to define a
 Another way to provide the energy performance is to define a
-perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH, and set the
-starpu_perfmodel::arch_cost_function field to a function which shall return the
-estimated consumption of the task in Joules. Such a function can for instance
+perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH or
+::STARPU_PER_WORKER, and set the starpu_perfmodel::arch_cost_function or
+starpu_perfmodel::worker_cost_function field to a function which shall return
+the estimated consumption of the task in Joules. Such a function can for instance
 use starpu_task_expected_length() on the task (in µs), multiplied by the
 use starpu_task_expected_length() on the task (in µs), multiplied by the
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 get Joules.
 get Joules.

+ 2 - 1
doc/doxygen/chapters/350_scheduling_policy_definition.doxy

@@ -45,7 +45,8 @@ provides a complete list of the functions available for writing advanced schedul
 This includes getting an estimation for a task computation completion with
 This includes getting an estimation for a task computation completion with
 starpu_task_expected_length(), for the required data transfers with
 starpu_task_expected_length(), for the required data transfers with
 starpu_task_expected_data_transfer_time_for(), for the required energy with
 starpu_task_expected_data_transfer_time_for(), for the required energy with
-starpu_task_expected_energy(), etc. Other
+starpu_task_expected_energy(), etc. Per-worker variants are also available with
+starpu_task_worker_expected_length(), etc. Other
 useful functions include starpu_transfer_bandwidth(), starpu_transfer_latency(),
 useful functions include starpu_transfer_bandwidth(), starpu_transfer_latency(),
 starpu_transfer_predict(), ...
 starpu_transfer_predict(), ...
 One can also directly test the presence of a data handle with starpu_data_is_on_node().
 One can also directly test the presence of a data handle with starpu_data_is_on_node().

+ 5 - 0
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -401,6 +401,11 @@ filled with pointers to functions which return the expected duration
 of the task in micro-seconds, one per architecture, see for instance
 of the task in micro-seconds, one per architecture, see for instance
 <c>tests/datawizard/locality.c</c>
 <c>tests/datawizard/locality.c</c>
 </li>
 </li>
+
+<li>
+Provided explicitly by the application (model type ::STARPU_PER_WORKER)
+similarly with the starpu_perfmodel::worker_cost_function field.
+</li>
 </ul>
 </ul>
 
 
 For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and
 For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and

+ 8 - 1
include/starpu_perfmodel.h

@@ -165,6 +165,7 @@ struct starpu_perfmodel_per_arch
 enum starpu_perfmodel_type
 enum starpu_perfmodel_type
 {
 {
         STARPU_PERFMODEL_INVALID=0,
         STARPU_PERFMODEL_INVALID=0,
+	STARPU_PER_WORKER,                /**< Application-provided per-worker cost model function */
 	STARPU_PER_ARCH,                  /**< Application-provided per-arch cost model function */
 	STARPU_PER_ARCH,                  /**< Application-provided per-arch cost model function */
 	STARPU_COMMON,                    /**< Application-provided common cost model function, with per-arch factor */
 	STARPU_COMMON,                    /**< Application-provided common cost model function, with per-arch factor */
 	STARPU_HISTORY_BASED,             /**< Automatic history-based cost model */
 	STARPU_HISTORY_BASED,             /**< Automatic history-based cost model */
@@ -226,11 +227,17 @@ struct starpu_perfmodel
 	*/
 	*/
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 	/**
 	/**
-	   Used by ::STARPU_COMMON. Take a task, an arch and implementation
+	   Used by ::STARPU_PER_ARCH. Take a task, an arch and implementation
 	   number, and must return a task duration estimation in
 	   number, and must return a task duration estimation in
 	   micro-seconds on that arch.
 	   micro-seconds on that arch.
 	*/
 	*/
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
+	/**
+	   Used by ::STARPU_PER_WORKER. Take a task, a worker id and implementation
+	   number, and must return a task duration estimation in
+	   micro-seconds on that worker.
+	*/
+	double (*worker_cost_function)(struct starpu_task *, unsigned workerid, unsigned nimpl);
 
 
 	/**
 	/**
 	   Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
 	   Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and

+ 14 - 0
include/starpu_scheduler.h

@@ -110,6 +110,10 @@ struct starpu_sched_policy
 	   to be executed by the worker. This method therefore permits
 	   to be executed by the worker. This method therefore permits
 	   to keep the state of the scheduler coherent even when
 	   to keep the state of the scheduler coherent even when
 	   StarPU bypasses the scheduling strategy.
 	   StarPU bypasses the scheduling strategy.
+
+	   Note: to get an estimation of the task duration, \p perf_workerid
+	   needs to be used rather than \p workerid, for the case of parallel
+	   tasks.
 	*/
 	*/
 	void (*push_task_notify)(struct starpu_task *, int workerid, int perf_workerid, unsigned sched_ctx_id);
 	void (*push_task_notify)(struct starpu_task *, int workerid, int perf_workerid, unsigned sched_ctx_id);
 
 
@@ -366,6 +370,11 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task);
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
 
 /**
 /**
+   Same as starpu_task_expected_length() but for a precise worker.
+*/
+double starpu_task_worker_expected_length(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl);
+
+/**
    Return an estimated speedup factor relative to CPU speed
    Return an estimated speedup factor relative to CPU speed
 */
 */
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch);
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch);
@@ -395,6 +404,11 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
 
 /**
 /**
+   Same as starpu_task_expected_energy but for a precise worker
+*/
+double starpu_task_worker_expected_energy(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl);
+
+/**
    Return expected conversion time in ms (multiformat interface only)
    Return expected conversion time in ms (multiformat interface only)
 */
 */
 double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);

+ 3 - 3
mpi/include/starpu_mpi.h

@@ -50,9 +50,9 @@ extern "C"
 int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm, struct starpu_conf *conf);
 int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm, struct starpu_conf *conf);
 
 
 /**
 /**
-   Same as starpu_mpi_init_conf(), except that this does not
-   initialize the StarPU library. The caller thus has to call
-   starpu_init() before this.
+   Same as starpu_mpi_init_conf(), except that this does not initialize the
+   StarPU library. The caller thus has to call starpu_init() before this, and it
+   StarPU library. The caller thus has to call starpu_init() before this, and it
 */
 */
 int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm);
 int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm);
 
 

+ 28 - 2
mpi/tests/Makefile.am

@@ -137,7 +137,13 @@ starpu_mpi_TESTS +=				\
 	temporary				\
 	temporary				\
 	user_defined_datatype			\
 	user_defined_datatype			\
 	early_stuff				\
 	early_stuff				\
-	sendrecv_bench
+	sendrecv_bench				\
+	sendrecv_parallel_tasks_bench
+
+if !NO_BLAS_LIB
+starpu_mpi_TESTS +=				\
+	sendrecv_gemm_bench
+endif
 
 
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
 # missing support in simgrid
 # missing support in simgrid
@@ -226,7 +232,9 @@ noinst_PROGRAMS =				\
 	starpu_redefine				\
 	starpu_redefine				\
 	load_balancer				\
 	load_balancer				\
 	driver					\
 	driver					\
-	sendrecv_bench
+	sendrecv_bench				\
+	sendrecv_gemm_bench			\
+	sendrecv_parallel_tasks_bench
 
 
 XFAIL_TESTS=					\
 XFAIL_TESTS=					\
 	policy_register_toomany			\
 	policy_register_toomany			\
@@ -256,4 +264,22 @@ mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
+
+sendrecv_bench_SOURCES = sendrecv_bench.c
+sendrecv_bench_SOURCES += bench_helper.c
+sendrecv_bench_SOURCES += abstract_sendrecv_bench.c
+
+sendrecv_parallel_tasks_bench_SOURCES = sendrecv_parallel_tasks_bench.c
+sendrecv_parallel_tasks_bench_SOURCES += bench_helper.c
+sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
+
+if !NO_BLAS_LIB
+sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
+sendrecv_gemm_bench_SOURCES += bench_helper.c
+sendrecv_gemm_bench_SOURCES += abstract_sendrecv_bench.c
+sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
+
+sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
+endif
+
 endif
 endif

+ 136 - 0
mpi/tests/abstract_sendrecv_bench.c

@@ -0,0 +1,136 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "bench_helper.h"
+#include "abstract_sendrecv_bench.h"
+
+
+
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
+{
+	uint64_t iterations = LOOPS_DEFAULT;
+
+	if (mpi_rank >= 2)
+	{
+		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+		{
+			iterations = bench_nb_iterations(iterations, s);
+
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+
+			for (uint64_t j = 0; j < iterations; j++)
+			{
+				starpu_mpi_barrier(MPI_COMM_WORLD);
+			}
+		}
+
+		return;
+	}
+
+	if (mpi_rank == 0)
+	{
+		printf("Times in us\n");
+		printf("# size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
+	}
+
+	int array_size = 0;
+	starpu_data_handle_t handle_send, handle_recv;
+	float* vector_send = NULL;
+	float* vector_recv = NULL;
+	double t1, t2, global_tstart, global_tend;
+	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
+
+	if (thread_barrier != NULL)
+	{
+		STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
+	}
+
+	global_tstart = starpu_timing_now();
+	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	{
+		vector_send = malloc(s);
+		vector_recv = malloc(s);
+		memset(vector_send, 0, s);
+		memset(vector_recv, 0, s);
+
+		starpu_vector_data_register(&handle_send, STARPU_MAIN_RAM, (uintptr_t) vector_send, s, 1);
+		starpu_vector_data_register(&handle_recv, STARPU_MAIN_RAM, (uintptr_t) vector_recv, s, 1);
+
+		iterations = bench_nb_iterations(iterations, s);
+
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+
+		for (uint64_t j = 0; j < iterations; j++)
+		{
+			if (mpi_rank == 0)
+			{
+				t1 = starpu_timing_now();
+				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+				t2 = starpu_timing_now();
+
+				const double t = (t2 -t1) / 2;
+
+				lats[j] = t;
+			}
+			else
+			{
+				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+			}
+
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+		}
+
+		if (mpi_rank == 0)
+		{
+			qsort(lats, iterations, sizeof(double), &comp_double);
+
+			const double min_lat = lats[0];
+			const double max_lat = lats[iterations - 1];
+			const double med_lat = lats[(iterations - 1) / 2];
+			const double d1_lat = lats[(iterations - 1) / 10];
+			const double d9_lat = lats[9 * (iterations - 1) / 10];
+			double avg_lat = 0.0;
+
+			for(uint64_t k = 0; k < iterations; k++)
+			{
+				avg_lat += lats[k];
+			}
+
+			avg_lat /= iterations;
+			const double bw_million_byte = s / min_lat;
+			const double bw_mbyte        = bw_million_byte / 1.048576;
+
+			printf("%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
+				(long long)s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
+			fflush(stdout);
+		}
+		starpu_data_unregister(handle_recv);
+		starpu_data_unregister(handle_send);
+
+		free(vector_send);
+		free(vector_recv);
+	}
+	global_tend = starpu_timing_now();
+
+	if (mpi_rank == 0)
+	{
+		printf("Comm bench took %9.3lf ms\n", (global_tend - global_tstart) / 1000);
+	}
+
+	free(lats);
+}

+ 21 - 0
mpi/tests/abstract_sendrecv_bench.h

@@ -0,0 +1,21 @@
+
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier);

+ 62 - 0
mpi/tests/bench_helper.c

@@ -0,0 +1,62 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "bench_helper.h"
+
+
/* qsort()-compatible comparator sorting doubles in ascending order. */
int comp_double(const void*_a, const void*_b)
{
	const double x = *(const double*)_a;
	const double y = *(const double*)_b;

	/* (x > y) - (x < y) yields 1, -1 or 0 without branching. */
	return (x > y) - (x < y);
}
+
+
+uint64_t bench_next_size(uint64_t len)
+{
+	uint64_t next = len * MULT_DEFAULT + INCR_DEFAULT;
+
+	if(next <= len)
+		next++;
+
+	return next;
+}
+
+
+uint64_t bench_nb_iterations(int iterations, uint64_t len)
+{
+	const uint64_t max_data = NX_MAX;
+
+	if(len <= 0)
+		len = 1;
+
+	uint64_t data_size = ((uint64_t)iterations * (uint64_t)len);
+
+	if(data_size  > max_data)
+	{
+		iterations = (max_data / (uint64_t)len);
+		if(iterations < 2)
+			iterations = 2;
+	}
+
+	return iterations;
+}

+ 37 - 0
mpi/tests/bench_helper.h

@@ -0,0 +1,37 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NX_MAX (512 * 1024 * 1024) // kB
+#define NX_MIN 0
+#ifdef STARPU_QUICK_CHECK
+#define MULT_DEFAULT 4
+#else
+#define MULT_DEFAULT 2
+#endif
+#define INCR_DEFAULT 0
+#ifdef STARPU_QUICK_CHECK
+#define LOOPS_DEFAULT 100
+#else
+#define LOOPS_DEFAULT 100000
+#endif
+
+int comp_double(const void*_a, const void*_b);
+uint64_t bench_next_size(uint64_t len);
+uint64_t bench_nb_iterations(int iterations, uint64_t len);

+ 6 - 168
mpi/tests/sendrecv_bench.c

@@ -18,84 +18,15 @@
  * Inspired a lot from NewMadeleine examples/benchmarks/nm_bench_sendrecv.c
  * Inspired a lot from NewMadeleine examples/benchmarks/nm_bench_sendrecv.c
  */
  */
 
 
-#include <math.h>
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
+#include "abstract_sendrecv_bench.h"
 
 
-#define NX_MAX (512 * 1024 * 1024) // kB
-#define NX_MIN 0
-#ifdef STARPU_QUICK_CHECK
-#define MULT_DEFAULT 4
-#else
-#define MULT_DEFAULT 2
-#endif
-#define INCR_DEFAULT 0
-#define NX_STEP 1.4 // multiplication
-#ifdef STARPU_QUICK_CHECK
-#define LOOPS_DEFAULT 100
-#else
-#define LOOPS_DEFAULT 10000
-#endif
-
-int times_nb_nodes;
-int times_size;
-int worldsize;
-
-static int comp_double(const void*_a, const void*_b)
-{
-	const double* a = _a;
-	const double* b = _b;
-
-	if(*a < *b)
-		return -1;
-	else if(*a > *b)
-		return 1;
-	else
-		return 0;
-}
-
-static inline uint64_t _next(uint64_t len, double multiplier, uint64_t increment)
-{
-	uint64_t next = len * multiplier + increment;
-
-	if(next <= len)
-		next++;
-
-	return next;
-}
-
-
-static inline uint64_t _iterations(int iterations, uint64_t len)
-{
-	const uint64_t max_data = 512 * 1024 * 1024;
-
-	if(len <= 0)
-		len = 1;
-
-	uint64_t data_size = ((uint64_t)iterations * (uint64_t)len);
-
-	if(data_size  > max_data)
-	{
-		iterations = (max_data / (uint64_t)len);
-		if(iterations < 2)
-			iterations = 2;
-	}
-
-	return iterations;
-}
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	int ret, rank;
-	starpu_data_handle_t handle_send, handle_recv;
+	int ret, rank, worldsize;
 	int mpi_init;
 	int mpi_init;
-	float* vector_send = NULL;
-	float* vector_recv = NULL;
-	double t1, t2;
-	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
-	uint64_t iterations = LOOPS_DEFAULT;
-	double multiplier = MULT_DEFAULT;
-	uint64_t increment = INCR_DEFAULT;
 
 
 	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
 	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
@@ -115,108 +46,15 @@ int main(int argc, char **argv)
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
-	if (rank >= 2)
-	{
-		starpu_pause();
-		for (uint64_t s = NX_MIN; s <= NX_MAX; s = _next(s, multiplier, increment))
-		{
-			iterations = _iterations(iterations, s);
-
-			starpu_mpi_barrier(MPI_COMM_WORLD);
-
-			for (uint64_t j = 0; j < iterations; j++)
-			{
-				starpu_mpi_barrier(MPI_COMM_WORLD);
-			}
-		}
-		starpu_resume();
-
-		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
-		return 0;
-	}
-
-	if (rank == 0)
-	{
-		printf("Times in us\n");
-		printf("# size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
-	}
-
-	int array_size = 0;
-
-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = _next(s, multiplier, increment))
-	{
-		vector_send = malloc(s);
-		vector_recv = malloc(s);
-		memset(vector_send, 0, s);
-		memset(vector_recv, 0, s);
-
-		starpu_vector_data_register(&handle_send, STARPU_MAIN_RAM, (uintptr_t) vector_send, s, 1);
-		starpu_vector_data_register(&handle_recv, STARPU_MAIN_RAM, (uintptr_t) vector_recv, s, 1);
-
-		iterations = _iterations(iterations, s);
+	/* Pause workers for this bench: all workers polling for tasks has a strong impact on performances */
+	starpu_pause();
 
 
-		starpu_mpi_barrier(MPI_COMM_WORLD);
-
-		for (uint64_t j = 0; j < iterations; j++)
-		{
-			if (rank == 0)
-			{
-				t1 = starpu_timing_now();
-				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
-				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
-				t2 = starpu_timing_now();
-
-				const double delay = t2 - t1;
-				const double t = delay / 2;
-
-				lats[j] = t;
-			}
-			else
-			{
-				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
-				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
-			}
-
-			starpu_mpi_barrier(MPI_COMM_WORLD);
-		}
-
-		if (rank == 0)
-		{
-			qsort(lats, iterations, sizeof(double), &comp_double);
-
-			const double min_lat = lats[0];
-			const double max_lat = lats[iterations - 1];
-			const double med_lat = lats[(iterations - 1) / 2];
-			const double d1_lat = lats[(iterations - 1) / 10];
-			const double d9_lat = lats[9 * (iterations - 1) / 10];
-			double avg_lat = 0.0;
-
-			for(uint64_t k = 0; k < iterations; k++)
-			{
-				avg_lat += lats[k];
-			}
-
-			avg_lat /= iterations;
-			const double bw_million_byte = s / min_lat;
-			const double bw_mbyte        = bw_million_byte / 1.048576;
-
-			printf("%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
-				(long long)s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
-			fflush(stdout);
-		}
-		starpu_data_unregister(handle_recv);
-		starpu_data_unregister(handle_send);
-
-		free(vector_send);
-		free(vector_recv);
-	}
+	sendrecv_bench(rank, NULL);
 
 
+	starpu_resume();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	if (!mpi_init)
 	if (!mpi_init)
 		MPI_Finalize();
 		MPI_Finalize();
 
 
-	free(lats);
 	return 0;
 	return 0;
 }
 }

+ 463 - 0
mpi/tests/sendrecv_gemm_bench.c

@@ -0,0 +1,463 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Simple *not distributed* parallel GEMM implementation and sendrecv bench at the same time.
+ *
+ * This bench is a merge of mpi/tests/sendrecv_bench and examples/mult/sgemm
+ *
+ * A *non-distributed* GEMM is computed on each node, while a sendrecv bench is running,
+ * completely independently. The goal is to measure the impact of worker computations on
+ * communications.
+ *
+ * Use the -nblocks parameter to define the matrix size (matrix size = nblocks * 320), such as
+ * the GEMM finishes after the sendrecv bench.
+ */
+#include <limits.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <starpu_mpi.h>
+#include <starpu_fxt.h>
+
+#include <common/blas.h>
+
+#include "helper.h"
+#include "abstract_sendrecv_bench.h"
+#include "../../examples/mult/simple.h"
+
+#define CHECK_TASK_SUBMIT(ret) do {				\
+	if (ret == -ENODEV)					\
+	{							\
+		ret = 77;					\
+		goto enodev;					\
+	}							\
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");	\
+} while(0)
+
+static int mpi_rank;
+static int comm_thread_cpuid = -1;
+static unsigned nslices = 4;
+#if defined(STARPU_QUICK_CHECK) && !defined(STARPU_SIMGRID)
+static unsigned matrix_dim = 256;
+#else
+static unsigned matrix_dim = 320 * 4;
+#endif
+static unsigned check = 0;
+
+static TYPE *A, *B, *C;
+static starpu_data_handle_t A_handle, B_handle, C_handle;
+
+static starpu_pthread_barrier_t thread_barrier;
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+#define PRINTF(fmt, ...) do { if (!getenv("STARPU_SSILENT")) {printf(fmt, ## __VA_ARGS__); fflush(stdout); }} while(0)
+
+static void check_output(void)
+{
+	/* compute C = C - AB */
+	CPU_GEMM("N", "N", matrix_dim, matrix_dim, matrix_dim, (TYPE)-1.0f, A, matrix_dim, B, matrix_dim, (TYPE)1.0f, C, matrix_dim);
+
+	/* make sure C = 0 */
+	TYPE err;
+	err = CPU_ASUM(matrix_dim*matrix_dim, C, 1);
+
+	if (err < matrix_dim*matrix_dim*0.001)
+	{
+		FPRINTF(stderr, "Results are OK\n");
+	}
+	else
+	{
+		int max;
+		max = CPU_IAMAX(matrix_dim*matrix_dim, C, 1);
+
+		FPRINTF(stderr, "There were errors ... err = %f\n", err);
+		FPRINTF(stderr, "Max error : %e\n", C[max]);
+	}
+}
+
+static void init_problem_data(void)
+{
+#ifndef STARPU_SIMGRID
+	unsigned i,j;
+#endif
+
+	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+
+#ifndef STARPU_SIMGRID
+	/* fill the matrices */
+	for (j=0; j < matrix_dim; j++)
+	{
+		for (i=0; i < matrix_dim; i++)
+		{
+			A[j+i*matrix_dim] = (TYPE)(starpu_drand48());
+			B[j+i*matrix_dim] = (TYPE)(starpu_drand48());
+			C[j+i*matrix_dim] = (TYPE)(0);
+		}
+	}
+#endif
+}
+
+static void partition_mult_data(void)
+{
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+
+	struct starpu_data_filter vert;
+	memset(&vert, 0, sizeof(vert));
+	vert.filter_func = starpu_matrix_filter_vertical_block;
+	vert.nchildren = nslices;
+
+	struct starpu_data_filter horiz;
+	memset(&horiz, 0, sizeof(horiz));
+	horiz.filter_func = starpu_matrix_filter_block;
+	horiz.nchildren = nslices;
+
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
+
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+}
+
+
+void cpu_init_matrix_random(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (starpu_drand48());
+		subB[i] = (TYPE) (starpu_drand48());
+	}
+}
+
+
+void cpu_init_matrix_zero(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (0);
+	}
+}
+
+
+void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	int worker_size = starpu_combined_worker_get_size();
+
+	if (worker_size == 1)
+	{
+		/* Sequential CPU task */
+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
+	}
+	else
+	{
+		/* Parallel CPU task */
+		unsigned rank = starpu_combined_worker_get_rank();
+
+		unsigned block_size = (nyC + worker_size - 1)/worker_size;
+		unsigned new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
+
+		STARPU_ASSERT(nyC == STARPU_MATRIX_GET_NY(descr[1]));
+
+		TYPE *new_subB = &subB[block_size*rank];
+		TYPE *new_subC = &subC[block_size*rank];
+
+		CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
+	}
+}
+
+static struct starpu_perfmodel starpu_gemm_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = STARPU_GEMM_STR(gemm)
+};
+
+static struct starpu_codelet cl =
+{
+	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_mult},
+	.cpu_funcs_name = {"cpu_mult"},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &starpu_gemm_model
+};
+
+static struct starpu_codelet cl_init_matrix_random =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_random},
+	.cpu_funcs_name = {"cpu_init_matrix_random"},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W}
+};
+
+static struct starpu_codelet cl_init_matrix_zero =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_zero},
+	.cpu_funcs_name = {"cpu_init_matrix_zero"},
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			char *argptr;
+			nslices = strtol(argv[++i], &argptr, 10);
+			matrix_dim = 320 * nslices;
+		}
+
+		else if (strcmp(argv[i], "-size") == 0)
+		{
+			char *argptr;
+			unsigned matrix_dim_tmp = strtol(argv[++i], &argptr, 10);
+			if (matrix_dim_tmp % 320 != 0)
+			{
+				fprintf(stderr, "Matrix size has to be a multiple of 320\n");
+			}
+			else
+			{
+				matrix_dim = matrix_dim_tmp;
+				nslices = matrix_dim / 320;
+			}
+		}
+
+		else if (strcmp(argv[i], "-check") == 0)
+		{
+			check = 1;
+		}
+
+		else if (strcmp(argv[i], "-spmd") == 0)
+		{
+			cl.type = STARPU_SPMD;
+		}
+
+		else if (strcmp(argv[i], "-comm-thread-cpuid") == 0)
+		{
+			comm_thread_cpuid = atoi(argv[++i]);
+		}
+
+		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
+		{
+			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-spmd] [-comm_thread_cpuid cpuid]\n", argv[0]);
+			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks\n", matrix_dim, nslices);
+			fprintf(stderr, "Use -comm_thread_cpuid to specifiy where to bind the comm benchmarking thread\n");
+			exit(EXIT_SUCCESS);
+		}
+		else
+		{
+			fprintf(stderr,"Unrecognized option %s", argv[i]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+
+static void* comm_thread_func(void* arg)
+{
+	if (comm_thread_cpuid < 0)
+	{
+		comm_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
+	}
+
+	if (starpu_bind_thread_on(comm_thread_cpuid, 0, "Comm") < 0)
+	{
+		char hostname[65];
+		gethostname(hostname, sizeof(hostname));
+		_STARPU_DISP("[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
+	}
+
+	sendrecv_bench(mpi_rank, &thread_barrier);
+
+	return NULL;
+}
+
+
+int main(int argc, char **argv)
+{
+	double start, end;
+	int ret, mpi_init, worldsize;
+	starpu_pthread_t comm_thread;
+
+	char hostname[255];
+	gethostname(hostname, 255);
+
+	parse_args(argc, argv);
+
+	starpu_fxt_autostart_profiling(0);
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	if (worldsize < 2)
+	{
+		if (mpi_rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+
+	STARPU_PTHREAD_BARRIER_INIT(&thread_barrier, NULL, 2);
+
+
+	// Start comm thread, benchmarking sendrecv:
+	STARPU_PTHREAD_CREATE(&comm_thread, NULL, comm_thread_func, NULL);
+
+
+	// Main thread will submit GEMM tasks:
+	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	partition_mult_data();
+
+
+	if (mpi_rank == 0)
+	{
+		PRINTF("# node\tx\ty\tz\tms\tGFlops\n");
+	}
+
+	starpu_pause();
+
+	unsigned x, y;
+#ifndef STARPU_SIMGRID
+	// Initialize matrices:
+	for (x = 0; x < nslices; x++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl_init_matrix_random;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, x);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+
+		for (y = 0; y < nslices; y++)
+		{
+			task = starpu_task_create();
+			task->cl = &cl_init_matrix_zero;
+			task->handles[0] = starpu_data_get_sub_data(C_handle, 2, x, y);
+			ret = starpu_task_submit(task);
+			CHECK_TASK_SUBMIT(ret);
+		}
+	}
+#endif
+
+	for (x = 0; x < nslices; x++)
+	for (y = 0; y < nslices; y++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
+		task->flops = 2ULL * (matrix_dim/nslices) * (matrix_dim/nslices) * matrix_dim;
+
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+		starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	starpu_fxt_start_profiling();
+
+	STARPU_PTHREAD_BARRIER_WAIT(&thread_barrier);
+
+	start = starpu_timing_now();
+	starpu_resume();
+	starpu_task_wait_for_all();
+	end = starpu_timing_now();
+	starpu_pause(); // Pause not to disturb comm thread if it isn't done
+
+	double timing = end - start;
+	double flops = 2.0*((unsigned long long)matrix_dim) * ((unsigned long long)matrix_dim)*((unsigned long long)matrix_dim);
+
+	PRINTF("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
+
+
+enodev:
+	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+
+	if (check)
+		check_output();
+
+	starpu_free_flags(A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+
+
+	// Wait comm thread:
+	STARPU_PTHREAD_JOIN(comm_thread, NULL);
+	STARPU_PTHREAD_BARRIER_DESTROY(&thread_barrier);
+
+	starpu_fxt_stop_profiling();
+
+	starpu_resume();
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return ret;
+}

+ 215 - 0
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -0,0 +1,215 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * sendrecv benchmark from different tasks, executed simultaneously on several
+ * workers.
+ * Heavily inspired by NewMadeleine examples/piom/nm_piom_pingpong.c
+ *
+ * The goal is to measure impact of calls to starpu_mpi_* from different threads.
+ *
+ * Use STARPU_NCPU to set the number of parallel ping pongs
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+#include "bench_helper.h"
+#include "abstract_sendrecv_bench.h"
+
+#define NB_WARMUP_PINGPONGS 10
+
+/* We reduce NX_MAX, since some NICs don't support exchanging simultaneously such amount of memory */
+#undef NX_MAX
+#define NX_MAX (64 * 1024 * 1024)
+
+
+/* Codelet body: runs one MPI ping-pong stream between ranks 0 and 1, entirely
+ * from inside a StarPU CPU worker.  Rank 0 measures round-trip latencies and
+ * prints the statistics line for this worker; rank 1 only echoes.  All
+ * arguments (rank, pinned worker id, message size, data handles) come through
+ * the codelet args buffer, not through StarPU-managed buffers. */
+void cpu_task(void* descr[], void* args)
+{
+	int mpi_rank;
+	uint64_t iterations = LOOPS_DEFAULT / 100;
+	uint64_t s;
+	starpu_data_handle_t handle_send, handle_recv;
+	double t1, t2;
+	int asked_worker;
+	int current_worker = starpu_worker_get_id();
+
+	starpu_codelet_unpack_args(args, &mpi_rank, &asked_worker, &s, &handle_send, &handle_recv);
+
+	/* The submitter pinned this task with STARPU_EXECUTE_ON_WORKER; make
+	 * sure the scheduler actually honoured it. */
+	STARPU_ASSERT(asked_worker == current_worker);
+
+	iterations = bench_nb_iterations(iterations, s);
+	double* lats = malloc(sizeof(double) * iterations);
+	STARPU_ASSERT(lats != NULL);
+
+	/* Warm-up ping-pongs: neither timed nor recorded. */
+	for (uint64_t j = 0; j < NB_WARMUP_PINGPONGS; j++)
+	{
+		if (mpi_rank == 0)
+		{
+			starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+			starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+		}
+		else
+		{
+			starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+			starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+		}
+	}
+
+	/* Timed ping-pongs; each recorded latency is half a round trip. */
+	for (uint64_t j = 0; j < iterations; j++)
+	{
+		if (mpi_rank == 0)
+		{
+			t1 = starpu_timing_now();
+			starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+			starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+			t2 = starpu_timing_now();
+
+			lats[j] =  (t2 - t1) / 2;
+		}
+		else
+		{
+			starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+			starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+		}
+	}
+
+	if (mpi_rank == 0)
+	{
+		/* Sort the samples to extract min/median/deciles/max. */
+		qsort(lats, iterations, sizeof(double), &comp_double);
+
+		const double min_lat = lats[0];
+		const double max_lat = lats[iterations - 1];
+		const double med_lat = lats[(iterations - 1) / 2];
+		const double d1_lat = lats[(iterations - 1) / 10];
+		const double d9_lat = lats[9 * (iterations - 1) / 10];
+		double avg_lat = 0.0;
+
+		for(uint64_t k = 0; k < iterations; k++)
+		{
+			avg_lat += lats[k];
+		}
+
+		avg_lat /= iterations;
+		/* Bandwidth derived from the best observed latency. */
+		const double bw_million_byte = s / min_lat;
+		const double bw_mbyte        = bw_million_byte / 1.048576;
+
+		printf("%2d\t\t%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
+			current_worker, (long long) s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
+		fflush(stdout);
+	}
+
+	free(lats); /* was leaked: this codelet runs once per (worker, size) pair */
+}
+
+/* Codelet with no StarPU-managed buffers: the data handles are passed through
+ * the args buffer instead (see cpu_task), because the task drives the MPI
+ * transfers itself with starpu_mpi_send()/starpu_mpi_recv(). */
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = { cpu_task },
+	.cpu_funcs_name = { "cpu_task" },
+	.nbuffers = 0
+};
+
+/* Benchmark driver: for each message size, launches one ping-pong task per
+ * CPU worker (count controlled by STARPU_NCPU) so that all streams run
+ * simultaneously, then collects and frees the per-stream resources.
+ * Only ranks 0 and 1 take part; extra ranks exit immediately. */
+int main(int argc, char **argv)
+{
+	int ret, rank, worldsize;
+	int mpi_init;
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	if (worldsize < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	if (rank == 0)
+	{
+		printf("Times in us\n");
+		printf("# worker | size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
+	}
+	else if (rank >= 2)
+	{
+		/* The ping-pong only involves ranks 0 and 1. */
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return 0;
+	}
+
+
+	unsigned cpu_count = starpu_cpu_worker_get_count();
+
+	/* Per-stream resources, reused across message sizes. */
+	int* workers = malloc(cpu_count * sizeof(int));
+	float** vectors_send = malloc(cpu_count * sizeof(float*));
+	float** vectors_recv = malloc(cpu_count * sizeof(float*));
+	starpu_data_handle_t* handles_send = malloc(cpu_count * sizeof(starpu_data_handle_t));
+	starpu_data_handle_t* handles_recv = malloc(cpu_count * sizeof(starpu_data_handle_t));
+	STARPU_ASSERT(workers && vectors_send && vectors_recv && handles_send && handles_recv);
+
+	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	{
+		/* Pause workers during submission so all streams start together
+		 * when starpu_resume() is called. */
+		starpu_pause();
+
+		for (unsigned i = 0; i < cpu_count; i++)
+		{
+			workers[i] = (int) i;
+			vectors_send[i] = malloc(s);
+			vectors_recv[i] = malloc(s);
+			STARPU_ASSERT(vectors_send[i] && vectors_recv[i]);
+			memset(vectors_send[i], 0, s);
+			memset(vectors_recv[i], 0, s);
+
+			starpu_vector_data_register(&handles_send[i], STARPU_MAIN_RAM, (uintptr_t) vectors_send[i], s, 1);
+			starpu_vector_data_register(&handles_recv[i], STARPU_MAIN_RAM, (uintptr_t) vectors_recv[i], s, 1);
+
+			starpu_task_insert(&cl,
+					STARPU_EXECUTE_ON_WORKER, workers[i],
+					STARPU_VALUE, &rank, sizeof(int),
+					STARPU_VALUE, workers + i, sizeof(int),
+					STARPU_VALUE, &s, sizeof(uint64_t),
+					STARPU_VALUE, &handles_send[i], sizeof(starpu_data_handle_t),
+					STARPU_VALUE, &handles_recv[i], sizeof(starpu_data_handle_t), 0);
+		}
+
+		starpu_resume();
+		starpu_task_wait_for_all();
+
+		for (unsigned i = 0; i < cpu_count; i++)
+		{
+			starpu_data_unregister(handles_send[i]);
+			starpu_data_unregister(handles_recv[i]);
+			free(vectors_send[i]);
+			free(vectors_recv[i]);
+		}
+	}
+
+	free(workers);
+	free(vectors_send);
+	free(vectors_recv);
+	free(handles_send);
+	free(handles_recv);
+
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return 0;
+}

+ 46 - 0
src/core/perfmodel/perfmodel.c

@@ -81,6 +81,20 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsi
 }
 }
 
 
 /*
 /*
+ * PER WORKER model
+ */
+
+/* Expected execution time (us) of one task implementation on one worker,
+ * as provided by the user's STARPU_PER_WORKER cost function. */
+static double per_worker_task_expected_perf(struct starpu_perfmodel *model, unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	/* A STARPU_PER_WORKER model is unusable without a cost function. */
+	STARPU_ASSERT_MSG(model->worker_cost_function, "STARPU_PER_WORKER needs worker_cost_function to be defined");
+
+	return model->worker_cost_function(task, workerid, nimpl);
+}
+
+/*
  * PER ARCH model
  * PER ARCH model
  */
  */
 
 
@@ -156,6 +170,7 @@ void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model)
 
 
 	switch (model->type)
 	switch (model->type)
 	{
 	{
+		case STARPU_PER_WORKER:
 		case STARPU_PER_ARCH:
 		case STARPU_PER_ARCH:
 		case STARPU_COMMON:
 		case STARPU_COMMON:
 			/* Nothing more to do than init */
 			/* Nothing more to do than init */
@@ -220,6 +235,20 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 	return exp_perf;
 	return exp_perf;
 }
 }
 
 
+/* Prediction for a given worker: dispatch between STARPU_PER_WORKER models
+ * (direct per-worker callback) and all other model types (per-arch lookup). */
+static double starpu_model_worker_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	/* No model at all: report a zero cost. */
+	if (!model)
+		return 0.0;
+
+	if (model->type == STARPU_PER_WORKER)
+		return per_worker_task_expected_perf(model, workerid, task, nimpl);
+
+	/* Fall back on the per-arch prediction for this worker's archtype. */
+	struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
+	return starpu_model_expected_perf(task, model, arch, nimpl);
+}
+
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	if (!task->cl)
 	if (!task->cl)
@@ -228,6 +257,14 @@ double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfm
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 }
 }
 
 
+/* Expected length (us) of a task on a precise worker, honouring
+ * STARPU_PER_WORKER models when the codelet uses one. */
+double starpu_task_worker_expected_length(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	/* Tasks without codelet don't actually take time */
+	return task->cl
+		? starpu_model_worker_expected_perf(task, task->cl->model, workerid, sched_ctx_id, nimpl)
+		: 0.0;
+}
+
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	if (!task->cl)
 	if (!task->cl)
@@ -236,6 +273,15 @@ double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfm
 	return starpu_model_expected_perf(task, task->cl->energy_model, arch, nimpl);
 	return starpu_model_expected_perf(task, task->cl->energy_model, arch, nimpl);
 }
 }
 
 
+/* Expected energy consumption of a task on a precise worker, honouring
+ * STARPU_PER_WORKER energy models when the codelet uses one. */
+double starpu_task_worker_expected_energy(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!task->cl)
+		/* Tasks without codelet don't actually consume energy */
+		return 0.0;
+	return starpu_model_worker_expected_perf(task, task->cl->energy_model, workerid, sched_ctx_id, nimpl);
+
+}
+
 double starpu_task_expected_conversion_time(struct starpu_task *task,
 double starpu_task_expected_conversion_time(struct starpu_task *task,
 					    struct starpu_perfmodel_arch* arch,
 					    struct starpu_perfmodel_arch* arch,
 					    unsigned nimpl)
 					    unsigned nimpl)

+ 5 - 2
src/sched_policies/component_sched.c

@@ -49,7 +49,6 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 	    workerid != -1;
 	    workerid != -1;
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	{
 	{
-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 		int nimpl;
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 		{
@@ -59,9 +58,13 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 				double d;
 				double d;
 				can_execute = 1;
 				can_execute = 1;
 				if(bundle)
 				if(bundle)
+				{
+					struct starpu_perfmodel_arch* archtype =
+						starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 					d = starpu_task_bundle_expected_length(bundle, archtype, nimpl);
 					d = starpu_task_bundle_expected_length(bundle, archtype, nimpl);
+				}
 				else
 				else
-					d = starpu_task_expected_length(task, archtype, nimpl);
+					d = starpu_task_worker_expected_length(task, workerid, component->tree->sched_ctx_id, nimpl);
 				if(isnan(d))
 				if(isnan(d))
 				{
 				{
 					*length = d;
 					*length = d;

+ 5 - 6
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -487,7 +487,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 			}
 			}
 
 
 			double exp_end;
 			double exp_end;
-			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
+			double local_length = starpu_task_worker_expected_length(task, worker, sched_ctx_id, nimpl);
 			double local_penalty = starpu_task_expected_data_transfer_time_for(task, worker);
 			double local_penalty = starpu_task_expected_data_transfer_time_for(task, worker);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
 
@@ -679,9 +679,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			}
 			}
 			else
 			else
 			{
 			{
-				local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_task_length[worker_ctx][nimpl] = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, nimpl);
 				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
 				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
-				local_energy[worker_ctx][nimpl] = starpu_task_expected_energy(task, perf_arch,nimpl);
+				local_energy[worker_ctx][nimpl] = starpu_task_worker_expected_energy(task, workerid, sched_ctx_id,nimpl);
 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
 				if (conversion_time > 0.0)
 				if (conversion_time > 0.0)
 					local_task_length[worker_ctx][nimpl] += conversion_time;
 					local_task_length[worker_ctx][nimpl] += conversion_time;
@@ -1100,10 +1100,9 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 {
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
-	/* Compute the expected penality */
-	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid, sched_ctx_id);
 
 
-	double predicted = starpu_task_expected_length(task, perf_arch,
+	/* Compute the expected penality */
+	double predicted = starpu_task_worker_expected_length(task, perf_workerid, STARPU_NMAX_SCHED_CTXS,
 						       starpu_task_get_implementation(task));
 						       starpu_task_get_implementation(task));
 
 
 	double predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);
 	double predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);

+ 11 - 1
tools/starpu_replay.c

@@ -1085,13 +1085,23 @@ int main(int argc, char **argv)
 		}
 		}
 		else if (TEST("Sizes"))
 		else if (TEST("Sizes"))
 		{
 		{
+			*ln = 0;
 			char *  buffer = s + 7;
 			char *  buffer = s + 7;
 			const char * delim = " ";
 			const char * delim = " ";
-			char * token = strtok(buffer, delim);
+			unsigned nb_parameters_line = count_number_tokens(buffer, delim); 
 			unsigned k = 0;
 			unsigned k = 0;
 
 
+			if(nb_parameters == 0)
+			{
+				nb_parameters = nb_parameters_line; 
+				arrays_managing(set_alloc_mode(nb_parameters));
+			}
+			else
+				STARPU_ASSERT(nb_parameters == nb_parameters_line);
+
 			_STARPU_MALLOC(sizes_set, nb_parameters * sizeof(size_t));
 			_STARPU_MALLOC(sizes_set, nb_parameters * sizeof(size_t));
 
 
+			char * token = strtok(buffer, delim);
 			while (token != NULL && k < nb_parameters)
 			while (token != NULL && k < nb_parameters)
 			{
 			{
 				sizes_set[k] = strtol(token, NULL, 10);
 				sizes_set[k] = strtol(token, NULL, 10);