Forráskód Böngészése

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu

Denis Barthou 5 éve
szülő
commit
ce489e4de4

+ 2 - 0
.gitignore

@@ -7,6 +7,8 @@
 /build
 /build2
 /build-aux
+/build_starpu
+/install
 /GPATH
 /GRTAGS
 /GTAGS

+ 1 - 0
ChangeLog

@@ -29,6 +29,7 @@ New features:
   * New number_events.data trace file which monitors number of events in trace
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
+  * New STARPU_PER_WORKER perfmodel.
 
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.

+ 167 - 249
configure.ac

@@ -92,6 +92,7 @@ if test x$enable_perf_debug = xyes; then
     enable_shared=no
 fi
 default_enable_mpi_check=maybe
+default_enable_mpi=maybe
 
 ###############################################################################
 #                                                                             #
@@ -206,6 +207,9 @@ if test x$enable_simgrid = xyes ; then
         # want that by default
 	default_enable_mpi_check=no
 
+	# disable MPI support by default
+	default_enable_mpi=no
+
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	AC_LANG_PUSH([C++])
 	if test x$enable_shared = xno ; then
@@ -270,145 +274,146 @@ fi
 
 ###############################################################################
 #                                                                             #
-#                                    MPI                                      #
+#                                LIBTOOLS                                     #
 #                                                                             #
 ###############################################################################
 
-AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
-                              [Disable StarPU MPI library generation])],
-            [enable_mpi=$enableval],
-            [enable_mpi=yes])
+#c++11 detection
+AX_CXX_COMPILE_STDCXX(11,noext,optional)
 
-AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
-                              [Enable StarPU to run with the master-slave mode])],
-            use_mpi_master_slave=$enableval,
-            use_mpi_master_slave=no)
+AC_SUBST([STARPU_HAVE_CXX11], $HAVE_CXX11)
+AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
+if test $HAVE_CXX11 -eq 1; then
+  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
+fi
 
-#Check MPICC
-AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
-           [Path of the mpicc compiler])],
-   [
-       if test x$withval = xyes; then
-           AC_MSG_ERROR(--with-mpicc must be given a pathname)
-       elif test x$withval = xno ; then
-           mpi_requested=no
-	   mpicc_path=""
-	   use_mpi=no
-       else
-	   mpi_requested=yes
-           mpicc_path=$withval
-       fi
-   ],
-   [
-       mpi_requested=maybe
-       if test x$enable_simgrid = xyes ; then
-           DEFAULT_MPICC=smpicc
-       else
-           DEFAULT_MPICC=mpicc
-       fi
-       # nothing was specified: default value is used
-       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
-   ])
+LT_PREREQ([2.2])
+LT_INIT([win32-dll])
 
-# in case MPI was explicitely required, but is not available, this is an error
-if test x$mpi_requested = xyes -a ! -x "$mpicc_path"; then
-   AC_MSG_ERROR([Compiler MPI not valid])
-fi
+AC_HEADER_STDC
 
-if test x$mpi_requested != xno ; then
-   # We test if the MPICC compiler exists
-     if test ! -x $mpicc_path; then
-         #MPICC does not exists or is not executable
-	 AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
-	 use_mpi=no
-     else
-	 use_mpi=yes
-	 if test x$enable_simgrid = xyes ; then
-             AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
-						  [Path of the smpirun helper])],
-			 [
-			     if test x$withval = xyes; then
-				 AC_MSG_ERROR(--with-smpirun must be given a pathname)
-			     else
-				 smpirun_path=$withval
-			     fi
-			 ],
-			 [
-			     # nothing was specified: default value is used
-			     AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
-			 ])
-	 fi
-     fi
+AC_C_RESTRICT
+
+# Check if bash is available
+AC_PATH_PROG([REALBASH], [bash], , [/bin:$PATH])
+
+# Record git version
+AC_PATH_PROG(gitcommand, git)
+if test "$gitcommand" = "" ; then
+   if test -f $srcdir/STARPU-REVISION ; then
+      cp $srcdir/STARPU-REVISION .
+   else
+      echo "unknown" > ./STARPU-REVISION
+   fi
+else
+   git log -n 1 --pretty="%H" $srcdir > ./STARPU-REVISION
 fi
 
-AC_MSG_CHECKING(mpicc path)
+AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
+
+###############################################################################
+#                                                                             #
+#                           MPI compilers                                     #
+#                                                                             #
+###############################################################################
+
+#Check MPICC
+if test x$enable_simgrid = xyes ; then
+    DEFAULT_MPICC=smpicc
+else
+    DEFAULT_MPICC=mpicc
+fi
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc=<path to mpicc>], [Path of the mpicc compiler])], [DEFAULT_MPICC=$withval])
+AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+AC_MSG_CHECKING(whether mpicc is available)
 AC_MSG_RESULT($mpicc_path)
 AC_SUBST(MPICC, $mpicc_path)
 
-
 #Check MPICXX/MPIC++
-AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
-           [Path of the mpicxx/mpic++ compiler])],
-   [
-       if test x$withval = xyes; then
-           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
-       else
-           mpicxx_path=$withval
-       fi
-   ],
-   [
-       if test x$enable_simgrid = xyes ; then
-           DEFAULT_MPICXX=smpicxx
-       else
-           DEFAULT_MPICXX=mpicxx
-       fi
-       # nothing was specified: default value is used
-       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+if test x$enable_simgrid = xyes ; then
+    DEFAULT_MPICXX=smpicxx
+else
+    DEFAULT_MPICXX=mpicxx
+fi
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx=<path to mpicxx>], [Path of the mpicxx/mpic++ compiler])], [DEFAULT_MPICXX=$withval])
+AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
 
-       # try with mpic++ if mpicxx was not found
-       if test x$mpicxx_path = xno ; then
-            DEFAULT_MPICXX=mpic++
-            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
-       fi
-   ])
+# try with mpic++ if mpicxx was not found
+if test x$mpicxx_path = xno ; then
+    DEFAULT_MPICXX=mpic++
+    AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+fi
 
 # We test if the MPICXX/MPIC++ compiler exists
 if test ! -x $mpicxx_path; then
-    #MPICXX/MPIC++ does not exists or is not executable
     AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
-    use_mpicxx=no
-else
-    use_mpicxx=yes
+    mpicxx_path=no
 fi
 
-AC_MSG_CHECKING(mpicxx/mpic++ path)
+AC_MSG_CHECKING(whether mpicxx is available)
 AC_MSG_RESULT($mpicxx_path)
 AC_SUBST(MPICXX, $mpicxx_path)
 
+###############################################################################
+#                                                                             #
+#                                    MPI                                      #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
+                              [Disable StarPU MPI library generation])],
+            [enable_mpi=$enableval],
+            [enable_mpi=$default_enable_mpi])
 
-if test x$use_mpi = xyes -a \( x$enable_mpi = xyes -o x$use_mpi_master_slave = xyes \) ; then
-    cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
-else
-    cc_or_mpicc=$CC
+if test x$enable_mpi = xmaybe ; then
+    if test -x "$mpicc_path"; then
+	enable_mpi=yes
+    else
+	enable_mpi=no
+    fi
 fi
 
-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+# in case MPI was explicitly required, but mpicc is not available, this is an error
+if test x$enable_mpi = xyes -a ! -x "$mpicc_path"; then
+   AC_MSG_ERROR([Compiler MPI '$mpicc_path' not valid])
+fi
 
-AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
-				   [Prevent StarPU MPI from reading buffers while being sent over MPI])],
-				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
-if test x$enable_mpi_pedantic_isend = xyes; then
-	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
+build_mpi_lib=$enable_mpi
+
+###############################################################################
+#                                                                             #
+#                                NEW MADELEINE                                #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(nmad, [AS_HELP_STRING([--enable-nmad],
+		                    [Enable StarPU MPI library generation using the new madeleine backend])],
+            [enable_nmad=$enableval],
+            [enable_nmad=no])
+
+build_nmad_lib=no
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+#We can only build StarPU MPI Library if User wants it and MPI is available
+if test x$enable_mpi = xyes -a x$enable_nmad = xyes ; then
+    build_nmad_lib=yes
+    build_mpi_lib=no
+    PKG_CHECK_MODULES([NMAD],[nmad])
+else
+    build_nmad_lib=no
 fi
 
-#We can only build MPI Master Slave if User wants it and MPI is available
-if test x$use_mpi_master_slave = xyes -a x$use_mpi = xyes -a x$use_mpicxx = xyes; then
+###############################################################################
+#                                                                             #
+#                             MPI Master Slave                                #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
+                              [Enable StarPU to run with the master-slave mode])],
+              use_mpi_master_slave=$enableval,
+              use_mpi_master_slave=no)
+#We can only build MPI Master Slave if User wants it and MPI compilers are available
+if test x$use_mpi_master_slave = xyes -a x$mpicc_path != xno -a x${mpicxx_path} != xno ; then
     build_mpi_master_slave=yes
 else
     build_mpi_master_slave=no
@@ -417,7 +422,9 @@ fi
 #users cannot use both at the same time
 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
     AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
-	enable_mpi=no
+    build_mpi_lib=no
+    build_nmad_lib=no
+    enable_mpi=no
 fi
 
 if test x$build_mpi_master_slave = xyes; then
@@ -449,95 +456,19 @@ AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
 AC_MSG_RESULT($nmaxmpidev)
 AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
 
-
-###############################################################################
-#                                                                             #
-#                                NEW MADELEINE                                #
-#                                                                             #
-###############################################################################
-
-AC_ARG_ENABLE(nmad, [AS_HELP_STRING([--enable-nmad],
-                              [Enable StarPU MPI library generation using new madeleine instead of mpi])],
-            [enable_nmad=$enableval],
-            [enable_nmad=no])
-
-if test x$use_mpi = xyes -a \( x$enable_nmad \) ; then
-    cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
-else
-    cc_or_mpicc=$CC
-fi
-
-build_nmad_lib=no
-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
-#We can only build StarPU MPI Library if User wants it and MPI is available
-if test x$use_mpi = xyes -a x$enable_nmad = xyes ; then
-    build_nmad_lib=yes
-    enable_mpi=no
-    PKG_CHECK_MODULES([NMAD],[nmad])
-else
-    build_nmad_lib=no
-fi
-
-# in case NMAD was explicitely required, but the compiler MPI, this is an error
-if test x$enable_nmad = xyes -a ! -x "$mpicc_path"; then
-   AC_MSG_ERROR([Compiler MPI not valid])
-fi
-
-
-AC_MSG_CHECKING(whether the StarPU MPI nmad library should be generated)
-AC_MSG_RESULT($build_nmad_lib)
-
 ###############################################################################
 #                                                                             #
-#                                LIBTOOLS                                     #
+#                       Miscellaneous things for MPI                          #
 #                                                                             #
 ###############################################################################
 
-#c++11 detection
-AX_CXX_COMPILE_STDCXX(11,noext,optional)
-
-AC_SUBST([STARPU_HAVE_CXX11], $HAVE_CXX11)
-AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
-if test $HAVE_CXX11 -eq 1; then
-  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
-fi
-
-LT_PREREQ([2.2])
-LT_INIT([win32-dll])
-
-AC_HEADER_STDC
-
-AC_C_RESTRICT
-
-# Check if bash is available
-AC_PATH_PROG([REALBASH], [bash], , [/bin:$PATH])
-
-# Record git version
-AC_PATH_PROG(gitcommand, git)
-if test "$gitcommand" = "" ; then
-   if test -f $srcdir/STARPU-REVISION ; then
-      cp $srcdir/STARPU-REVISION .
-   else
-      echo "unknown" > ./STARPU-REVISION
-   fi
-else
-   git log -n 1 --pretty="%H" $srcdir > ./STARPU-REVISION
+AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
+				   [Prevent StarPU MPI from reading buffers while being sent over MPI])],
+				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
+if test x$enable_mpi_pedantic_isend = xyes; then
+	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
 fi
 
-AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
-
-###############################################################################
-#                                                                             #
-#                       Miscellaneous things for MPI                          #
-#                                                                             #
-###############################################################################
-
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]),
 	      [enable_mpi_check=$enableval], [enable_mpi_check=$default_enable_mpi_check])
@@ -551,68 +482,45 @@ fi
 if test x$enable_mpi_check = xno ; then
     running_mpi_check=no
 fi
+if test x$enable_mpi = xno ; then
+    running_mpi_check=no
+fi
 
-
-if test x$enable_simgrid = xno ; then
+if test x$enable_mpi = xyes -a x$running_mpi_check = xyes ; then
     # Check if mpiexec is available
-    AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
-                [Path of mpiexec])],
-        [
-            if test x$withval = xyes; then
-                AC_MSG_ERROR(--with-mpiexec must be given a pathname)
-            else
-                mpiexec_path=$withval
-            fi
-        ],
-        [
-            # nothing was specified: look in the path
-	    if test x$mpicc_path = x ; then
-		AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$PATH])
-	    else
-		AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
-	    fi
-        ])
-
+    if test x$enable_simgrid = xyes ; then
+	DEFAULT_MPIEXEC=smpirun
+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]], [Path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
+	AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$simgrid_dir/bin:$PATH])
+    else
+	DEFAULT_MPIEXEC=mpiexec
+	AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<path to mpiexec>], [Path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
+	if test x$mpicc_path = x ; then
+	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$PATH])
+	else
+	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$(dirname $mpicc_path):$PATH])
+	fi
+    fi
     AC_MSG_CHECKING(whether mpiexec is available)
     AC_MSG_RESULT($mpiexec_path)
 
     # We test if MPIEXEC exists
     if test ! -x $mpiexec_path; then
-        # if it's not valid, it could be the parameter given to configure.ac was not a full path, let's look for it
-	if test x$mpicc_path = x ; then
-            AC_PATH_PROG(mpiexec_path_bis, $mpiexec_path, [no], [$PATH])
-	else
-            AC_PATH_PROG(mpiexec_path_bis, $mpiexec_path, [no], [$(dirname $mpicc_path):$PATH])
-	fi
-        AC_MSG_CHECKING(whether mpiexec is available (2nd try))
-        AC_MSG_RESULT($mpiexec_path_bis)
-	if test -x $mpiexec_path_bis; then
-	   mpiexec_path=$mpiexec_path_bis
-	else
-           #MPIEXEC does not exists or is not executable
-           AC_MSG_RESULT(The mpiexec script is not valid)
-           running_mpi_check=no
-           mpiexec_path=""
-	fi
+        AC_MSG_RESULT(The mpiexec script '$mpiexec_path' is not valid)
+        running_mpi_check=no
+        mpiexec_path=""
     fi
     AC_SUBST(MPIEXEC,$mpiexec_path)
 fi
 
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
-if test x$use_mpi = xyes ; then
-    AC_MSG_CHECKING(whether MPI tests should be run)
-    AC_MSG_RESULT($running_mpi_check)
-fi
-
-#We can only build StarPU MPI Library if User wants it and MPI is available
-if test x$use_mpi = xyes -a x$enable_mpi = xyes ; then
-    build_mpi_lib=yes
-else
-    build_mpi_lib=no
-fi
+AC_MSG_CHECKING(whether MPI tests should be run)
+AC_MSG_RESULT($running_mpi_check)
 
 AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
 AC_MSG_RESULT($build_mpi_lib)
+AC_MSG_CHECKING(whether the StarPU MPI nmad library should be generated)
+AC_MSG_RESULT($build_nmad_lib)
 
 AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes)
 if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
@@ -622,11 +530,9 @@ if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
 	else
 		AC_DEFINE(STARPU_USE_MPI_NMAD,[1],[whether the StarPU MPI library (with a NewMadeleine implementation) is available])
 	fi
-else
-	running_mpi_check=no
 fi
 
-if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
+if test x$enable_mpi = xyes ; then
     if test x$enable_simgrid = xyes ; then
         if test x$enable_shared = xyes ; then
	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, if you need the MPI support, then use --disable-shared to fix this, else disable MPI with --disable-mpi])
@@ -644,17 +550,16 @@ AM_CONDITIONAL(STARPU_USE_MPI_NMAD, test x$build_nmad_lib = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI, test x$build_nmad_lib = xyes -o x$build_mpi_lib = xyes)
 
 AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
-			[Arguments for mpiexec])],
-	[
+					  [Arguments for mpiexec])],
+	    [
 		mpiexec_args=$withval
-	])
+	    ])
 AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
 
-
 AC_MSG_CHECKING(whether MPI debug messages should be displayed)
 AC_ARG_ENABLE(mpi-verbose, [AS_HELP_STRING([--enable-mpi-verbose],
-			[display MPI verbose debug messages (--enable-mpi-verbose=extra increase the verbosity)])],
-			enable_mpi_verbose=$enableval, enable_mpi_verbose=no)
+					   [display MPI verbose debug messages (--enable-mpi-verbose=extra increase the verbosity)])],
+	      enable_mpi_verbose=$enableval, enable_mpi_verbose=no)
 AC_MSG_RESULT($enable_mpi_verbose)
 if test x$enable_mpi_verbose = xyes; then
 	AC_DEFINE(STARPU_MPI_VERBOSE, [1], [display MPI verbose debug messages])
@@ -664,6 +569,19 @@ if test x$enable_mpi_verbose = xextra; then
 	AC_DEFINE(STARPU_MPI_EXTRA_VERBOSE, [1], [display MPI verbose debug messages])
 fi
 
+if test x$enable_mpi = xyes -o x$build_mpi_master_slave = xyes ; then
+    cc_or_mpicc=$mpicc_path
+    # For some reason, libtool uses gcc instead of mpicc when linking
+    # libstarpumpi.
+    # On Darwin (and maybe other systems ?) the linker will fail (undefined
+    # references to MPI_*). We manually add the required flags to fix this
+    # issue.
+    AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
+else
+    cc_or_mpicc=$CC
+fi
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+
 ###############################################################################
 #                                                                             #
 #                           MIC device compilation                            #

+ 1 - 1
doc/doxygen/chapters/310_data_management.doxy

@@ -602,7 +602,7 @@ whole machine, it would not be efficient to accumulate them in only one place,
 incurring data transmission each time and access concurrency.
 
 StarPU provides a mode ::STARPU_REDUX, which permits to optimize
-this case: it will allocate a buffer on each memory node, and accumulate
+this case: it will allocate a buffer on each worker (lazily), and accumulate
 intermediate results there. When the data is eventually accessed in the normal
 mode ::STARPU_R, StarPU will collect the intermediate results in just one
 buffer.

+ 4 - 3
doc/doxygen/chapters/320_scheduling.doxy

@@ -190,9 +190,10 @@ single task gives the consumption of the task in Joules, which can be given to
 starpu_perfmodel_update_history().
 
 Another way to provide the energy performance is to define a
-perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH, and set the
-starpu_perfmodel::arch_cost_function field to a function which shall return the
-estimated consumption of the task in Joules. Such a function can for instance
+perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH or
+::STARPU_PER_WORKER, and set the starpu_perfmodel::arch_cost_function or
+starpu_perfmodel::worker_cost_function field to a function which shall return
+the estimated consumption of the task in Joules. Such a function can for instance
 use starpu_task_expected_length() on the task (in µs), multiplied by the
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 get Joules.

+ 2 - 1
doc/doxygen/chapters/350_scheduling_policy_definition.doxy

@@ -45,7 +45,8 @@ provides a complete list of the functions available for writing advanced schedul
 This includes getting an estimation for a task computation completion with
 starpu_task_expected_length(), for the required data transfers with
 starpu_task_expected_data_transfer_time_for(), for the required energy with
-starpu_task_expected_energy(), etc. Other
+starpu_task_expected_energy(), etc. Per-worker variants are also available with
+starpu_task_worker_expected_length(), etc. Other
 useful functions include starpu_transfer_bandwidth(), starpu_transfer_latency(),
 starpu_transfer_predict(), ...
 One can also directly test the presence of a data handle with starpu_data_is_on_node().

+ 5 - 0
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -401,6 +401,11 @@ filled with pointers to functions which return the expected duration
 of the task in micro-seconds, one per architecture, see for instance
 <c>tests/datawizard/locality.c</c>
 </li>
+
+<li>
+Provided explicitly by the application (model type ::STARPU_PER_WORKER)
+similarly with the starpu_perfmodel::worker_cost_function field.
+</li>
 </ul>
 
 For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and

+ 8 - 1
include/starpu_perfmodel.h

@@ -165,6 +165,7 @@ struct starpu_perfmodel_per_arch
 enum starpu_perfmodel_type
 {
         STARPU_PERFMODEL_INVALID=0,
+	STARPU_PER_WORKER,                /**< Application-provided per-worker cost model function */
 	STARPU_PER_ARCH,                  /**< Application-provided per-arch cost model function */
 	STARPU_COMMON,                    /**< Application-provided common cost model function, with per-arch factor */
 	STARPU_HISTORY_BASED,             /**< Automatic history-based cost model */
@@ -226,11 +227,17 @@ struct starpu_perfmodel
 	*/
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 	/**
-	   Used by ::STARPU_COMMON. Take a task, an arch and implementation
+	   Used by ::STARPU_PER_ARCH. Take a task, an arch and implementation
 	   number, and must return a task duration estimation in
 	   micro-seconds on that arch.
 	*/
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
+	/**
+	   Used by ::STARPU_PER_WORKER. Take a task, a worker id and implementation
+	   number, and must return a task duration estimation in
+	   micro-seconds on that worker.
+	*/
+	double (*worker_cost_function)(struct starpu_task *, unsigned workerid, unsigned nimpl);
 
 	/**
 	   Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and

+ 14 - 0
include/starpu_scheduler.h

@@ -110,6 +110,10 @@ struct starpu_sched_policy
 	   to be executed by the worker. This method therefore permits
 	   to keep the state of the scheduler coherent even when
 	   StarPU bypasses the scheduling strategy.
+
+	   Note: to get an estimation of the task duration, \p perf_workerid
+	   needs to be used rather than \p workerid, for the case of parallel
+	   tasks.
 	*/
 	void (*push_task_notify)(struct starpu_task *, int workerid, int perf_workerid, unsigned sched_ctx_id);
 
@@ -366,6 +370,11 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task);
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
 /**
+   Same as starpu_task_expected_length() but for a precise worker.
+*/
+double starpu_task_worker_expected_length(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl);
+
+/**
    Return an estimated speedup factor relative to CPU speed
 */
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch);
@@ -395,6 +404,11 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
 /**
+   Same as starpu_task_expected_energy but for a precise worker
+*/
+double starpu_task_worker_expected_energy(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl);
+
+/**
    Return expected conversion time in ms (multiformat interface only)
 */
 double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);

+ 3 - 3
mpi/include/starpu_mpi.h

@@ -50,9 +50,9 @@ extern "C"
 int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm, struct starpu_conf *conf);
 
 /**
-   Same as starpu_mpi_init_conf(), except that this does not
-   initialize the StarPU library. The caller thus has to call
-   starpu_init() before this.
+   Same as starpu_mpi_init_conf(), except that this does not initialize the
+   StarPU library. The caller thus has to call starpu_init() before this, and it
+   can not reserve a core for the MPI communications.
 */
 int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm);
 

+ 28 - 2
mpi/tests/Makefile.am

@@ -137,7 +137,13 @@ starpu_mpi_TESTS +=				\
 	temporary				\
 	user_defined_datatype			\
 	early_stuff				\
-	sendrecv_bench
+	sendrecv_bench				\
+	sendrecv_parallel_tasks_bench
+
+if !NO_BLAS_LIB
+starpu_mpi_TESTS +=				\
+	sendrecv_gemm_bench
+endif
 
 if !STARPU_SIMGRID
 # missing support in simgrid
@@ -226,7 +232,9 @@ noinst_PROGRAMS =				\
 	starpu_redefine				\
 	load_balancer				\
 	driver					\
-	sendrecv_bench
+	sendrecv_bench				\
+	sendrecv_gemm_bench			\
+	sendrecv_parallel_tasks_bench
 
 XFAIL_TESTS=					\
 	policy_register_toomany			\
@@ -256,4 +264,22 @@ mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
+
+sendrecv_bench_SOURCES = sendrecv_bench.c
+sendrecv_bench_SOURCES += bench_helper.c
+sendrecv_bench_SOURCES += abstract_sendrecv_bench.c
+
+sendrecv_parallel_tasks_bench_SOURCES = sendrecv_parallel_tasks_bench.c
+sendrecv_parallel_tasks_bench_SOURCES += bench_helper.c
+sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
+
+if !NO_BLAS_LIB
+sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
+sendrecv_gemm_bench_SOURCES += bench_helper.c
+sendrecv_gemm_bench_SOURCES += abstract_sendrecv_bench.c
+sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
+
+sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
+endif
+
 endif

+ 136 - 0
mpi/tests/abstract_sendrecv_bench.c

@@ -0,0 +1,136 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "bench_helper.h"
+#include "abstract_sendrecv_bench.h"
+
+
+
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
+{
+	uint64_t iterations = LOOPS_DEFAULT;
+
+	if (mpi_rank >= 2)
+	{
+		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+		{
+			iterations = bench_nb_iterations(iterations, s);
+
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+
+			for (uint64_t j = 0; j < iterations; j++)
+			{
+				starpu_mpi_barrier(MPI_COMM_WORLD);
+			}
+		}
+
+		return;
+	}
+
+	if (mpi_rank == 0)
+	{
+		printf("Times in us\n");
+		printf("# size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
+	}
+
+	int array_size = 0;
+	starpu_data_handle_t handle_send, handle_recv;
+	float* vector_send = NULL;
+	float* vector_recv = NULL;
+	double t1, t2, global_tstart, global_tend;
+	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
+
+	if (thread_barrier != NULL)
+	{
+		STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
+	}
+
+	global_tstart = starpu_timing_now();
+	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	{
+		vector_send = malloc(s);
+		vector_recv = malloc(s);
+		memset(vector_send, 0, s);
+		memset(vector_recv, 0, s);
+
+		starpu_vector_data_register(&handle_send, STARPU_MAIN_RAM, (uintptr_t) vector_send, s, 1);
+		starpu_vector_data_register(&handle_recv, STARPU_MAIN_RAM, (uintptr_t) vector_recv, s, 1);
+
+		iterations = bench_nb_iterations(iterations, s);
+
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+
+		for (uint64_t j = 0; j < iterations; j++)
+		{
+			if (mpi_rank == 0)
+			{
+				t1 = starpu_timing_now();
+				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+				t2 = starpu_timing_now();
+
+				const double t = (t2 -t1) / 2;
+
+				lats[j] = t;
+			}
+			else
+			{
+				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+			}
+
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+		}
+
+		if (mpi_rank == 0)
+		{
+			qsort(lats, iterations, sizeof(double), &comp_double);
+
+			const double min_lat = lats[0];
+			const double max_lat = lats[iterations - 1];
+			const double med_lat = lats[(iterations - 1) / 2];
+			const double d1_lat = lats[(iterations - 1) / 10];
+			const double d9_lat = lats[9 * (iterations - 1) / 10];
+			double avg_lat = 0.0;
+
+			for(uint64_t k = 0; k < iterations; k++)
+			{
+				avg_lat += lats[k];
+			}
+
+			avg_lat /= iterations;
+			const double bw_million_byte = s / min_lat;
+			const double bw_mbyte        = bw_million_byte / 1.048576;
+
+			printf("%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
+				(long long)s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
+			fflush(stdout);
+		}
+		starpu_data_unregister(handle_recv);
+		starpu_data_unregister(handle_send);
+
+		free(vector_send);
+		free(vector_recv);
+	}
+	global_tend = starpu_timing_now();
+
+	if (mpi_rank == 0)
+	{
+		printf("Comm bench took %9.3lf ms\n", (global_tend - global_tstart) / 1000);
+	}
+
+	free(lats);
+}

+ 21 - 0
mpi/tests/abstract_sendrecv_bench.h

@@ -0,0 +1,21 @@
+
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier);

+ 62 - 0
mpi/tests/bench_helper.c

@@ -0,0 +1,62 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "bench_helper.h"
+
+
+int comp_double(const void*_a, const void*_b)
+{
+	const double* a = _a;
+	const double* b = _b;
+
+	if(*a < *b)
+		return -1;
+	else if(*a > *b)
+		return 1;
+	else
+		return 0;
+}
+
+
+uint64_t bench_next_size(uint64_t len)
+{
+	uint64_t next = len * MULT_DEFAULT + INCR_DEFAULT;
+
+	if(next <= len)
+		next++;
+
+	return next;
+}
+
+
+uint64_t bench_nb_iterations(int iterations, uint64_t len)
+{
+	const uint64_t max_data = NX_MAX;
+
+	if(len <= 0)
+		len = 1;
+
+	uint64_t data_size = ((uint64_t)iterations * (uint64_t)len);
+
+	if(data_size  > max_data)
+	{
+		iterations = (max_data / (uint64_t)len);
+		if(iterations < 2)
+			iterations = 2;
+	}
+
+	return iterations;
+}

+ 37 - 0
mpi/tests/bench_helper.h

@@ -0,0 +1,37 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NX_MAX (512 * 1024 * 1024) // in bytes (512 MB)
+#define NX_MIN 0
+#ifdef STARPU_QUICK_CHECK
+#define MULT_DEFAULT 4
+#else
+#define MULT_DEFAULT 2
+#endif
+#define INCR_DEFAULT 0
+#ifdef STARPU_QUICK_CHECK
+#define LOOPS_DEFAULT 100
+#else
+#define LOOPS_DEFAULT 100000
+#endif
+
+int comp_double(const void*_a, const void*_b);
+uint64_t bench_next_size(uint64_t len);
+uint64_t bench_nb_iterations(int iterations, uint64_t len);

+ 6 - 168
mpi/tests/sendrecv_bench.c

@@ -18,84 +18,15 @@
  * Inspired a lot from NewMadeleine examples/benchmarks/nm_bench_sendrecv.c
  */
 
-#include <math.h>
 #include <starpu_mpi.h>
 #include "helper.h"
+#include "abstract_sendrecv_bench.h"
 
-#define NX_MAX (512 * 1024 * 1024) // kB
-#define NX_MIN 0
-#ifdef STARPU_QUICK_CHECK
-#define MULT_DEFAULT 4
-#else
-#define MULT_DEFAULT 2
-#endif
-#define INCR_DEFAULT 0
-#define NX_STEP 1.4 // multiplication
-#ifdef STARPU_QUICK_CHECK
-#define LOOPS_DEFAULT 100
-#else
-#define LOOPS_DEFAULT 10000
-#endif
-
-int times_nb_nodes;
-int times_size;
-int worldsize;
-
-static int comp_double(const void*_a, const void*_b)
-{
-	const double* a = _a;
-	const double* b = _b;
-
-	if(*a < *b)
-		return -1;
-	else if(*a > *b)
-		return 1;
-	else
-		return 0;
-}
-
-static inline uint64_t _next(uint64_t len, double multiplier, uint64_t increment)
-{
-	uint64_t next = len * multiplier + increment;
-
-	if(next <= len)
-		next++;
-
-	return next;
-}
-
-
-static inline uint64_t _iterations(int iterations, uint64_t len)
-{
-	const uint64_t max_data = 512 * 1024 * 1024;
-
-	if(len <= 0)
-		len = 1;
-
-	uint64_t data_size = ((uint64_t)iterations * (uint64_t)len);
-
-	if(data_size  > max_data)
-	{
-		iterations = (max_data / (uint64_t)len);
-		if(iterations < 2)
-			iterations = 2;
-	}
-
-	return iterations;
-}
 
 int main(int argc, char **argv)
 {
-	int ret, rank;
-	starpu_data_handle_t handle_send, handle_recv;
+	int ret, rank, worldsize;
 	int mpi_init;
-	float* vector_send = NULL;
-	float* vector_recv = NULL;
-	double t1, t2;
-	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
-	uint64_t iterations = LOOPS_DEFAULT;
-	double multiplier = MULT_DEFAULT;
-	uint64_t increment = INCR_DEFAULT;
 
 	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
@@ -115,108 +46,15 @@ int main(int argc, char **argv)
 		return STARPU_TEST_SKIPPED;
 	}
 
-	if (rank >= 2)
-	{
-		starpu_pause();
-		for (uint64_t s = NX_MIN; s <= NX_MAX; s = _next(s, multiplier, increment))
-		{
-			iterations = _iterations(iterations, s);
-
-			starpu_mpi_barrier(MPI_COMM_WORLD);
-
-			for (uint64_t j = 0; j < iterations; j++)
-			{
-				starpu_mpi_barrier(MPI_COMM_WORLD);
-			}
-		}
-		starpu_resume();
-
-		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
-		return 0;
-	}
-
-	if (rank == 0)
-	{
-		printf("Times in us\n");
-		printf("# size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
-	}
-
-	int array_size = 0;
-
-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = _next(s, multiplier, increment))
-	{
-		vector_send = malloc(s);
-		vector_recv = malloc(s);
-		memset(vector_send, 0, s);
-		memset(vector_recv, 0, s);
-
-		starpu_vector_data_register(&handle_send, STARPU_MAIN_RAM, (uintptr_t) vector_send, s, 1);
-		starpu_vector_data_register(&handle_recv, STARPU_MAIN_RAM, (uintptr_t) vector_recv, s, 1);
-
-		iterations = _iterations(iterations, s);
+	/* Pause workers for this bench: all workers polling for tasks has a strong impact on performances */
+	starpu_pause();
 
-		starpu_mpi_barrier(MPI_COMM_WORLD);
-
-		for (uint64_t j = 0; j < iterations; j++)
-		{
-			if (rank == 0)
-			{
-				t1 = starpu_timing_now();
-				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
-				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
-				t2 = starpu_timing_now();
-
-				const double delay = t2 - t1;
-				const double t = delay / 2;
-
-				lats[j] = t;
-			}
-			else
-			{
-				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
-				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
-			}
-
-			starpu_mpi_barrier(MPI_COMM_WORLD);
-		}
-
-		if (rank == 0)
-		{
-			qsort(lats, iterations, sizeof(double), &comp_double);
-
-			const double min_lat = lats[0];
-			const double max_lat = lats[iterations - 1];
-			const double med_lat = lats[(iterations - 1) / 2];
-			const double d1_lat = lats[(iterations - 1) / 10];
-			const double d9_lat = lats[9 * (iterations - 1) / 10];
-			double avg_lat = 0.0;
-
-			for(uint64_t k = 0; k < iterations; k++)
-			{
-				avg_lat += lats[k];
-			}
-
-			avg_lat /= iterations;
-			const double bw_million_byte = s / min_lat;
-			const double bw_mbyte        = bw_million_byte / 1.048576;
-
-			printf("%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
-				(long long)s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
-			fflush(stdout);
-		}
-		starpu_data_unregister(handle_recv);
-		starpu_data_unregister(handle_send);
-
-		free(vector_send);
-		free(vector_recv);
-	}
+	sendrecv_bench(rank, NULL);
 
+	starpu_resume();
 	starpu_mpi_shutdown();
 	if (!mpi_init)
 		MPI_Finalize();
 
-	free(lats);
 	return 0;
 }

+ 463 - 0
mpi/tests/sendrecv_gemm_bench.c

@@ -0,0 +1,463 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Simple *not distributed* parallel GEMM implementation and sendrecv bench at the same time.
+ *
+ * This bench is a merge of mpi/tests/sendrecv_bench and examples/mult/sgemm
+ *
+ * A *non-distributed* GEMM is computed on each node, while a sendrecv bench is running,
+ * completely independently. The goal is to measure the impact of worker computations on
+ * communications.
+ *
+ * Use the -nblocks parameter to define the matrix size (matrix size = nblocks * 320), such as
+ * the GEMM finishes after the sendrecv bench.
+ */
+#include <limits.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <starpu_mpi.h>
+#include <starpu_fxt.h>
+
+#include <common/blas.h>
+
+#include "helper.h"
+#include "abstract_sendrecv_bench.h"
+#include "../../examples/mult/simple.h"
+
+#define CHECK_TASK_SUBMIT(ret) do {				\
+	if (ret == -ENODEV)					\
+	{							\
+		ret = 77;					\
+		goto enodev;					\
+	}							\
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");	\
+} while(0)
+
+static int mpi_rank;
+static int comm_thread_cpuid = -1;
+static unsigned nslices = 4;
+#if defined(STARPU_QUICK_CHECK) && !defined(STARPU_SIMGRID)
+static unsigned matrix_dim = 256;
+#else
+static unsigned matrix_dim = 320 * 4;
+#endif
+static unsigned check = 0;
+
+static TYPE *A, *B, *C;
+static starpu_data_handle_t A_handle, B_handle, C_handle;
+
+static starpu_pthread_barrier_t thread_barrier;
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+#define PRINTF(fmt, ...) do { if (!getenv("STARPU_SSILENT")) {printf(fmt, ## __VA_ARGS__); fflush(stdout); }} while(0)
+
+static void check_output(void)
+{
+	/* compute C = C - AB */
+	CPU_GEMM("N", "N", matrix_dim, matrix_dim, matrix_dim, (TYPE)-1.0f, A, matrix_dim, B, matrix_dim, (TYPE)1.0f, C, matrix_dim);
+
+	/* make sure C = 0 */
+	TYPE err;
+	err = CPU_ASUM(matrix_dim*matrix_dim, C, 1);
+
+	if (err < matrix_dim*matrix_dim*0.001)
+	{
+		FPRINTF(stderr, "Results are OK\n");
+	}
+	else
+	{
+		int max;
+		max = CPU_IAMAX(matrix_dim*matrix_dim, C, 1);
+
+		FPRINTF(stderr, "There were errors ... err = %f\n", err);
+		FPRINTF(stderr, "Max error : %e\n", C[max]);
+	}
+}
+
+static void init_problem_data(void)
+{
+#ifndef STARPU_SIMGRID
+	unsigned i,j;
+#endif
+
+	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+
+#ifndef STARPU_SIMGRID
+	/* fill the matrices */
+	for (j=0; j < matrix_dim; j++)
+	{
+		for (i=0; i < matrix_dim; i++)
+		{
+			A[j+i*matrix_dim] = (TYPE)(starpu_drand48());
+			B[j+i*matrix_dim] = (TYPE)(starpu_drand48());
+			C[j+i*matrix_dim] = (TYPE)(0);
+		}
+	}
+#endif
+}
+
+static void partition_mult_data(void)
+{
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+
+	struct starpu_data_filter vert;
+	memset(&vert, 0, sizeof(vert));
+	vert.filter_func = starpu_matrix_filter_vertical_block;
+	vert.nchildren = nslices;
+
+	struct starpu_data_filter horiz;
+	memset(&horiz, 0, sizeof(horiz));
+	horiz.filter_func = starpu_matrix_filter_block;
+	horiz.nchildren = nslices;
+
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
+
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+}
+
+
+void cpu_init_matrix_random(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (starpu_drand48());
+		subB[i] = (TYPE) (starpu_drand48());
+	}
+}
+
+
+void cpu_init_matrix_zero(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (0);
+	}
+}
+
+
+void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	int worker_size = starpu_combined_worker_get_size();
+
+	if (worker_size == 1)
+	{
+		/* Sequential CPU task */
+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
+	}
+	else
+	{
+		/* Parallel CPU task */
+		unsigned rank = starpu_combined_worker_get_rank();
+
+		unsigned block_size = (nyC + worker_size - 1)/worker_size;
+		unsigned new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
+
+		STARPU_ASSERT(nyC == STARPU_MATRIX_GET_NY(descr[1]));
+
+		TYPE *new_subB = &subB[block_size*rank];
+		TYPE *new_subC = &subC[block_size*rank];
+
+		CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
+	}
+}
+
+static struct starpu_perfmodel starpu_gemm_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = STARPU_GEMM_STR(gemm)
+};
+
+static struct starpu_codelet cl =
+{
+	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_mult},
+	.cpu_funcs_name = {"cpu_mult"},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &starpu_gemm_model
+};
+
+static struct starpu_codelet cl_init_matrix_random =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_random},
+	.cpu_funcs_name = {"cpu_init_matrix_random"},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W}
+};
+
+static struct starpu_codelet cl_init_matrix_zero =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_zero},
+	.cpu_funcs_name = {"cpu_init_matrix_zero"},
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			char *argptr;
+			nslices = strtol(argv[++i], &argptr, 10);
+			matrix_dim = 320 * nslices;
+		}
+
+		else if (strcmp(argv[i], "-size") == 0)
+		{
+			char *argptr;
+			unsigned matrix_dim_tmp = strtol(argv[++i], &argptr, 10);
+			if (matrix_dim_tmp % 320 != 0)
+			{
+				fprintf(stderr, "Matrix size has to be a multiple of 320\n");
+			}
+			else
+			{
+				matrix_dim = matrix_dim_tmp;
+				nslices = matrix_dim / 320;
+			}
+		}
+
+		else if (strcmp(argv[i], "-check") == 0)
+		{
+			check = 1;
+		}
+
+		else if (strcmp(argv[i], "-spmd") == 0)
+		{
+			cl.type = STARPU_SPMD;
+		}
+
+		else if (strcmp(argv[i], "-comm-thread-cpuid") == 0)
+		{
+			comm_thread_cpuid = atoi(argv[++i]);
+		}
+
+		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
+		{
+			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-spmd] [-comm-thread-cpuid cpuid]\n", argv[0]);
+			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks\n", matrix_dim, nslices);
+			fprintf(stderr, "Use -comm-thread-cpuid to specify where to bind the comm benchmarking thread\n");
+			exit(EXIT_SUCCESS);
+		}
+		else
+		{
+			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+
+static void* comm_thread_func(void* arg)
+{
+	if (comm_thread_cpuid < 0)
+	{
+		comm_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
+	}
+
+	if (starpu_bind_thread_on(comm_thread_cpuid, 0, "Comm") < 0)
+	{
+		char hostname[65];
+		gethostname(hostname, sizeof(hostname));
+		_STARPU_DISP("[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
+	}
+
+	sendrecv_bench(mpi_rank, &thread_barrier);
+
+	return NULL;
+}
+
+
+int main(int argc, char **argv)
+{
+	double start, end;
+	int ret, mpi_init, worldsize;
+	starpu_pthread_t comm_thread;
+
+	char hostname[255];
+	gethostname(hostname, 255);
+
+	parse_args(argc, argv);
+
+	starpu_fxt_autostart_profiling(0);
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	if (worldsize < 2)
+	{
+		if (mpi_rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+
+	STARPU_PTHREAD_BARRIER_INIT(&thread_barrier, NULL, 2);
+
+
+	// Start comm thread, benchmarking sendrecv:
+	STARPU_PTHREAD_CREATE(&comm_thread, NULL, comm_thread_func, NULL);
+
+
+	// Main thread will submit GEMM tasks:
+	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	partition_mult_data();
+
+
+	if (mpi_rank == 0)
+	{
+		PRINTF("# node\tx\ty\tz\tms\tGFlops\n");
+	}
+
+	starpu_pause();
+
+	unsigned x, y;
+#ifndef STARPU_SIMGRID
+	// Initialize matrices:
+	for (x = 0; x < nslices; x++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl_init_matrix_random;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, x);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+
+		for (y = 0; y < nslices; y++)
+		{
+			task = starpu_task_create();
+			task->cl = &cl_init_matrix_zero;
+			task->handles[0] = starpu_data_get_sub_data(C_handle, 2, x, y);
+			ret = starpu_task_submit(task);
+			CHECK_TASK_SUBMIT(ret);
+		}
+	}
+#endif
+
+	for (x = 0; x < nslices; x++)
+	for (y = 0; y < nslices; y++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
+		task->flops = 2ULL * (matrix_dim/nslices) * (matrix_dim/nslices) * matrix_dim;
+
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+		starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	starpu_fxt_start_profiling();
+
+	STARPU_PTHREAD_BARRIER_WAIT(&thread_barrier);
+
+	start = starpu_timing_now();
+	starpu_resume();
+	starpu_task_wait_for_all();
+	end = starpu_timing_now();
+	starpu_pause(); // Pause not to disturb comm thread if it isn't done
+
+	double timing = end - start;
+	double flops = 2.0*((unsigned long long)matrix_dim) * ((unsigned long long)matrix_dim)*((unsigned long long)matrix_dim);
+
+	PRINTF("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
+
+
+enodev:
+	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+
+	if (check)
+		check_output();
+
+	starpu_free_flags(A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+
+
+	// Wait comm thread:
+	STARPU_PTHREAD_JOIN(comm_thread, NULL);
+	STARPU_PTHREAD_BARRIER_DESTROY(&thread_barrier);
+
+	starpu_fxt_stop_profiling();
+
+	starpu_resume();
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return ret;
+}

+ 215 - 0
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -0,0 +1,215 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * sendrecv benchmark from different tasks, executed simultaneously on several
+ * workers.
+ * Inspired a lot from NewMadeleine examples/piom/nm_piom_pingpong.c
+ *
+ * The goal is to measure impact of calls to starpu_mpi_* from different threads.
+ *
+ * Use STARPU_NCPU to set the number of parallel ping pongs
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+#include "bench_helper.h"
+#include "abstract_sendrecv_bench.h"
+
+#define NB_WARMUP_PINGPONGS 10
+
+/* We reduce NX_MAX, since some NICs don't support exchanging simultaneously such amount of memory */
+#undef NX_MAX
+#define NX_MAX (64 * 1024 * 1024)
+
+
+void cpu_task(void* descr[], void* args)
+{
+	int mpi_rank;
+	uint64_t iterations = LOOPS_DEFAULT / 100;
+	uint64_t s;
+	starpu_data_handle_t handle_send, handle_recv;
+	double t1, t2;
+	int asked_worker;
+	int current_worker = starpu_worker_get_id();
+
+	starpu_codelet_unpack_args(args, &mpi_rank, &asked_worker, &s, &handle_send, &handle_recv);
+
+	STARPU_ASSERT(asked_worker == current_worker);
+
+	iterations = bench_nb_iterations(iterations, s);
+	double* lats = malloc(sizeof(double) * iterations);
+
+	for (uint64_t j = 0; j < NB_WARMUP_PINGPONGS; j++)
+	{
+		if (mpi_rank == 0)
+		{
+			starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+			starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+		}
+		else
+		{
+			starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+			starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+		}
+	}
+
+	for (uint64_t j = 0; j < iterations; j++)
+	{
+		if (mpi_rank == 0)
+		{
+			t1 = starpu_timing_now();
+			starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+			starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+			t2 = starpu_timing_now();
+
+			lats[j] =  (t2 - t1) / 2;
+		}
+		else
+		{
+			starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+			starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+		}
+	}
+
+	if (mpi_rank == 0)
+	{
+		qsort(lats, iterations, sizeof(double), &comp_double);
+
+		const double min_lat = lats[0];
+		const double max_lat = lats[iterations - 1];
+		const double med_lat = lats[(iterations - 1) / 2];
+		const double d1_lat = lats[(iterations - 1) / 10];
+		const double d9_lat = lats[9 * (iterations - 1) / 10];
+		double avg_lat = 0.0;
+
+		for(uint64_t k = 0; k < iterations; k++)
+		{
+			avg_lat += lats[k];
+		}
+
+		avg_lat /= iterations;
+		const double bw_million_byte = s / min_lat;
+		const double bw_mbyte        = bw_million_byte / 1.048576;
+
+		printf("%2d\t\t%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
+			current_worker, (long long) s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
+		fflush(stdout);
+	}
+}
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = { cpu_task },
+	.cpu_funcs_name = { "cpu_task" },
+	.nbuffers = 0
+};
+
+int main(int argc, char **argv)
+{
+	int ret, rank, worldsize;
+	int mpi_init;
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	if (worldsize < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	if (rank == 0)
+	{
+		printf("Times in us\n");
+		printf("# worker | size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
+	}
+	else if (rank >= 2)
+	{
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return 0;
+	}
+
+
+	unsigned cpu_count = starpu_cpu_worker_get_count();
+	unsigned* mpi_tags = malloc(cpu_count * sizeof(unsigned));
+	unsigned tag = 0;
+
+	int* workers = malloc(cpu_count * sizeof(int));
+	float** vectors_send = malloc(cpu_count * sizeof(float*));
+	float** vectors_recv = malloc(cpu_count * sizeof(float*));
+	starpu_data_handle_t* handles_send = malloc(cpu_count * sizeof(starpu_data_handle_t));
+	starpu_data_handle_t* handles_recv = malloc(cpu_count * sizeof(starpu_data_handle_t));
+
+	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	{
+		starpu_pause();
+
+		for (int i = 0; i < cpu_count; i++)
+		{
+			workers[i] = i;
+			vectors_send[i] = malloc(s);
+			vectors_recv[i] = malloc(s);
+			memset(vectors_send[i], 0, s);
+			memset(vectors_recv[i], 0, s);
+
+			starpu_vector_data_register(&handles_send[i], STARPU_MAIN_RAM, (uintptr_t) vectors_send[i], s, 1);
+			starpu_vector_data_register(&handles_recv[i], STARPU_MAIN_RAM, (uintptr_t) vectors_recv[i], s, 1);
+
+			starpu_task_insert(&cl,
+					STARPU_EXECUTE_ON_WORKER, workers[i],
+					STARPU_VALUE, &rank, sizeof(int),
+					STARPU_VALUE, workers + i, sizeof(int),
+					STARPU_VALUE, &s, sizeof(uint64_t),
+					STARPU_VALUE, &handles_send[i], sizeof(starpu_data_handle_t),
+					STARPU_VALUE, &handles_recv[i], sizeof(starpu_data_handle_t), 0);
+		}
+
+		starpu_resume();
+		starpu_task_wait_for_all();
+
+		for (unsigned i = 0; i < cpu_count; i++)
+		{
+			starpu_data_unregister(handles_send[i]);
+			starpu_data_unregister(handles_recv[i]);
+			free(vectors_send[i]);
+			free(vectors_recv[i]);
+		}
+	}
+
+	free(workers);
+	free(vectors_send);
+	free(vectors_recv);
+	free(handles_send);
+	free(handles_recv);
+	free(mpi_tags);
+
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return 0;
+}

+ 46 - 0
src/core/perfmodel/perfmodel.c

@@ -81,6 +81,20 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsi
 }
 
 /*
+ * PER WORKER model
+ */
+
+static double per_worker_task_expected_perf(struct starpu_perfmodel *model, unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	double (*worker_cost_function)(struct starpu_task *task, unsigned workerid, unsigned nimpl);
+
+	worker_cost_function = model->worker_cost_function;
+	STARPU_ASSERT_MSG(worker_cost_function, "STARPU_PER_WORKER needs worker_cost_function to be defined");
+
+	return worker_cost_function(task, workerid, nimpl);
+}
+
+/*
  * PER ARCH model
  */
 
@@ -156,6 +170,7 @@ void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model)
 
 	switch (model->type)
 	{
+		case STARPU_PER_WORKER:
 		case STARPU_PER_ARCH:
 		case STARPU_COMMON:
 			/* Nothing more to do than init */
@@ -220,6 +235,20 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 	return exp_perf;
 }
 
+static double starpu_model_worker_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!model)
+		return 0.0;
+
+	if (model->type == STARPU_PER_WORKER)
+		return per_worker_task_expected_perf(model, workerid, task, nimpl);
+	else
+	{
+		struct starpu_perfmodel_arch *per_arch = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
+		return starpu_model_expected_perf(task, model, per_arch, nimpl);
+	}
+}
+
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	if (!task->cl)
@@ -228,6 +257,14 @@ double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfm
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 }
 
+double starpu_task_worker_expected_length(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!task->cl)
+		/* Tasks without codelet don't actually take time */
+		return 0.0;
+	return starpu_model_worker_expected_perf(task, task->cl->model, workerid, sched_ctx_id, nimpl);
+}
+
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	if (!task->cl)
@@ -236,6 +273,15 @@ double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfm
 	return starpu_model_expected_perf(task, task->cl->energy_model, arch, nimpl);
 }
 
+double starpu_task_worker_expected_energy(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!task->cl)
+		/* Tasks without codelet don't actually consume energy */
+		return 0.0;
+	return starpu_model_worker_expected_perf(task, task->cl->energy_model, workerid, sched_ctx_id, nimpl);
+
+}
+
 double starpu_task_expected_conversion_time(struct starpu_task *task,
 					    struct starpu_perfmodel_arch* arch,
 					    unsigned nimpl)

+ 5 - 2
src/sched_policies/component_sched.c

@@ -49,7 +49,6 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 	    workerid != -1;
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	{
-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
@@ -59,9 +58,13 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 				double d;
 				can_execute = 1;
 				if(bundle)
+				{
+					struct starpu_perfmodel_arch* archtype =
+						starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 					d = starpu_task_bundle_expected_length(bundle, archtype, nimpl);
+				}
 				else
-					d = starpu_task_expected_length(task, archtype, nimpl);
+					d = starpu_task_worker_expected_length(task, workerid, component->tree->sched_ctx_id, nimpl);
 				if(isnan(d))
 				{
 					*length = d;

+ 5 - 6
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -487,7 +487,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 			}
 
 			double exp_end;
-			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
+			double local_length = starpu_task_worker_expected_length(task, worker, sched_ctx_id, nimpl);
 			double local_penalty = starpu_task_expected_data_transfer_time_for(task, worker);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
@@ -679,9 +679,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			}
 			else
 			{
-				local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_task_length[worker_ctx][nimpl] = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, nimpl);
 				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
-				local_energy[worker_ctx][nimpl] = starpu_task_expected_energy(task, perf_arch,nimpl);
+				local_energy[worker_ctx][nimpl] = starpu_task_worker_expected_energy(task, workerid, sched_ctx_id,nimpl);
 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
 				if (conversion_time > 0.0)
 					local_task_length[worker_ctx][nimpl] += conversion_time;
@@ -1100,10 +1100,9 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
-	/* Compute the expected penality */
-	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid, sched_ctx_id);
 
-	double predicted = starpu_task_expected_length(task, perf_arch,
+	/* Compute the expected penalty */
+	double predicted = starpu_task_worker_expected_length(task, perf_workerid, STARPU_NMAX_SCHED_CTXS,
 						       starpu_task_get_implementation(task));
 
 	double predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);

+ 11 - 1
tools/starpu_replay.c

@@ -1085,13 +1085,23 @@ int main(int argc, char **argv)
 		}
 		else if (TEST("Sizes"))
 		{
+			*ln = 0;
 			char *  buffer = s + 7;
 			const char * delim = " ";
-			char * token = strtok(buffer, delim);
+			unsigned nb_parameters_line = count_number_tokens(buffer, delim);
 			unsigned k = 0;
 
+			if(nb_parameters == 0)
+			{
+				nb_parameters = nb_parameters_line;
+				arrays_managing(set_alloc_mode(nb_parameters));
+			}
+			else
+				STARPU_ASSERT(nb_parameters == nb_parameters_line);
+
 			_STARPU_MALLOC(sizes_set, nb_parameters * sizeof(size_t));
 
+			char * token = strtok(buffer, delim);
 			while (token != NULL && k < nb_parameters)
 			{
 				sizes_set[k] = strtol(token, NULL, 10);