Browse Source

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu

Denis Barthou 5 years ago
parent
commit
ce489e4de4

+ 2 - 0
.gitignore

@@ -7,6 +7,8 @@
 /build
 /build
 /build2
 /build2
 /build-aux
 /build-aux
+/build_starpu
+/install
 /GPATH
 /GPATH
 /GRTAGS
 /GRTAGS
 /GTAGS
 /GTAGS

+ 1 - 0
ChangeLog

@@ -29,6 +29,7 @@ New features:
   * New number_events.data trace file which monitors number of events in trace
   * New number_events.data trace file which monitors number of events in trace
     files. This file can be parsed by the new script
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
+  * New STARPU_PER_WORKER perfmodel.
 
 
 Small changes:
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
   * Use the S4U interface of Simgrid instead of xbt and MSG.

+ 167 - 249
configure.ac

@@ -92,6 +92,7 @@ if test x$enable_perf_debug = xyes; then
     enable_shared=no
     enable_shared=no
 fi
 fi
 default_enable_mpi_check=maybe
 default_enable_mpi_check=maybe
+default_enable_mpi=maybe
 
 
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
@@ -206,6 +207,9 @@ if test x$enable_simgrid = xyes ; then
         # want that by default
         # want that by default
 	default_enable_mpi_check=no
 	default_enable_mpi_check=no
 
 
+	# disable MPI support by default
+	default_enable_mpi=no
+
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	AC_LANG_PUSH([C++])
 	AC_LANG_PUSH([C++])
 	if test x$enable_shared = xno ; then
 	if test x$enable_shared = xno ; then
@@ -270,145 +274,146 @@ fi
 
 
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
-#                                    MPI                                      #
+#                                LIBTOOLS                                     #
 #                                                                             #
 #                                                                             #
 ###############################################################################
 ###############################################################################
 
 
-AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
-                              [Disable StarPU MPI library generation])],
-            [enable_mpi=$enableval],
-            [enable_mpi=yes])
+#c++11 detection
+AX_CXX_COMPILE_STDCXX(11,noext,optional)
 
 
-AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
-                              [Enable StarPU to run with the master-slave mode])],
-            use_mpi_master_slave=$enableval,
-            use_mpi_master_slave=no)
+AC_SUBST([STARPU_HAVE_CXX11], $HAVE_CXX11)
+AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
+if test $HAVE_CXX11 -eq 1; then
+  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
+fi
 
 
-#Check MPICC
-AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
-           [Path of the mpicc compiler])],
-   [
-       if test x$withval = xyes; then
-           AC_MSG_ERROR(--with-mpicc must be given a pathname)
-       elif test x$withval = xno ; then
-           mpi_requested=no
-	   mpicc_path=""
-	   use_mpi=no
-       else
-	   mpi_requested=yes
-           mpicc_path=$withval
-       fi
-   ],
-   [
-       mpi_requested=maybe
-       if test x$enable_simgrid = xyes ; then
-           DEFAULT_MPICC=smpicc
-       else
-           DEFAULT_MPICC=mpicc
-       fi
-       # nothing was specified: default value is used
-       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
-   ])
+LT_PREREQ([2.2])
+LT_INIT([win32-dll])
 
 
-# in case MPI was explicitely required, but is not available, this is an error
-if test x$mpi_requested = xyes -a ! -x "$mpicc_path"; then
-   AC_MSG_ERROR([Compiler MPI not valid])
-fi
+AC_HEADER_STDC
 
 
-if test x$mpi_requested != xno ; then
-   # We test if the MPICC compiler exists
-     if test ! -x $mpicc_path; then
-         #MPICC does not exists or is not executable
-	 AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
-	 use_mpi=no
-     else
-	 use_mpi=yes
-	 if test x$enable_simgrid = xyes ; then
-             AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
-						  [Path of the smpirun helper])],
-			 [
-			     if test x$withval = xyes; then
-				 AC_MSG_ERROR(--with-smpirun must be given a pathname)
-			     else
-				 smpirun_path=$withval
-			     fi
-			 ],
-			 [
-			     # nothing was specified: default value is used
-			     AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
-			 ])
-	 fi
-     fi
+AC_C_RESTRICT
+
+# Check if bash is available
+AC_PATH_PROG([REALBASH], [bash], , [/bin:$PATH])
+
+# Record git version
+AC_PATH_PROG(gitcommand, git)
+if test "$gitcommand" = "" ; then
+   if test -f $srcdir/STARPU-REVISION ; then
+      cp $srcdir/STARPU-REVISION .
+   else
+      echo "unknown" > ./STARPU-REVISION
+   fi
+else
+   git log -n 1 --pretty="%H" $srcdir > ./STARPU-REVISION
 fi
 fi
 
 
-AC_MSG_CHECKING(mpicc path)
+AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
+
+###############################################################################
+#                                                                             #
+#                           MPI compilers                                     #
+#                                                                             #
+###############################################################################
+
+#Check MPICC
+if test x$enable_simgrid = xyes ; then
+    DEFAULT_MPICC=smpicc
+else
+    DEFAULT_MPICC=mpicc
+fi
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc=<path to mpicc>], [Path of the mpicc compiler])], [DEFAULT_MPICC=$withval])
+AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+AC_MSG_CHECKING(whether mpicc is available)
 AC_MSG_RESULT($mpicc_path)
 AC_MSG_RESULT($mpicc_path)
 AC_SUBST(MPICC, $mpicc_path)
 AC_SUBST(MPICC, $mpicc_path)
 
 
-
 #Check MPICXX/MPIC++
 #Check MPICXX/MPIC++
-AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
-           [Path of the mpicxx/mpic++ compiler])],
-   [
-       if test x$withval = xyes; then
-           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
-       else
-           mpicxx_path=$withval
-       fi
-   ],
-   [
-       if test x$enable_simgrid = xyes ; then
-           DEFAULT_MPICXX=smpicxx
-       else
-           DEFAULT_MPICXX=mpicxx
-       fi
-       # nothing was specified: default value is used
-       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+if test x$enable_simgrid = xyes ; then
+    DEFAULT_MPICXX=smpicxx
+else
+    DEFAULT_MPICXX=mpicxx
+fi
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx=<path to mpicxx>], [Path of the mpicxx/mpic++ compiler])], [DEFAULT_MPICXX=$withval])
+AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
 
 
-       # try with mpic++ if mpicxx was not found
-       if test x$mpicxx_path = xno ; then
-            DEFAULT_MPICXX=mpic++
-            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
-       fi
-   ])
+# try with mpic++ if mpicxx was not found
+if test x$mpicxx_path = xno ; then
+    DEFAULT_MPICXX=mpic++
+    AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+fi
 
 
 # We test if the MPICXX/MPIC++ compiler exists
 # We test if the MPICXX/MPIC++ compiler exists
 if test ! -x $mpicxx_path; then
 if test ! -x $mpicxx_path; then
-    #MPICXX/MPIC++ does not exists or is not executable
     AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
     AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
-    use_mpicxx=no
-else
-    use_mpicxx=yes
+    mpicxx_path=no
 fi
 fi
 
 
-AC_MSG_CHECKING(mpicxx/mpic++ path)
+AC_MSG_CHECKING(whether mpicxx is available)
 AC_MSG_RESULT($mpicxx_path)
 AC_MSG_RESULT($mpicxx_path)
 AC_SUBST(MPICXX, $mpicxx_path)
 AC_SUBST(MPICXX, $mpicxx_path)
 
 
+###############################################################################
+#                                                                             #
+#                                    MPI                                      #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
+                              [Disable StarPU MPI library generation])],
+            [enable_mpi=$enableval],
+            [enable_mpi=$default_enable_mpi])
 
 
-if test x$use_mpi = xyes -a \( x$enable_mpi = xyes -o x$use_mpi_master_slave = xyes \) ; then
-    cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
-else
-    cc_or_mpicc=$CC
+if test x$enable_mpi = xmaybe ; then
+    if test -x "$mpicc_path"; then
+	enable_mpi=yes
+    else
+	enable_mpi=no
+    fi
 fi
 fi
 
 
-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+# in case MPI was explicitly required, but mpicc is not available, this is an error
+if test x$enable_mpi = xyes -a ! -x "$mpicc_path"; then
+   AC_MSG_ERROR([Compiler MPI '$mpicc_path' not valid])
+fi
 
 
-AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
-				   [Prevent StarPU MPI from reading buffers while being sent over MPI])],
-				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
-if test x$enable_mpi_pedantic_isend = xyes; then
-	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
+build_mpi_lib=$enable_mpi
+
+###############################################################################
+#                                                                             #
+#                                NEW MADELEINE                                #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(nmad, [AS_HELP_STRING([--enable-nmad],
+		                    [Enable StarPU MPI library generation using the new madeleine backend])],
+            [enable_nmad=$enableval],
+            [enable_nmad=no])
+
+build_nmad_lib=no
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+#We can only build StarPU MPI Library if User wants it and MPI is available
+if test x$enable_mpi = xyes -a x$enable_nmad = xyes ; then
+    build_nmad_lib=yes
+    build_mpi_lib=no
+    PKG_CHECK_MODULES([NMAD],[nmad])
+else
+    build_nmad_lib=no
 fi
 fi
 
 
-#We can only build MPI Master Slave if User wants it and MPI is available
-if test x$use_mpi_master_slave = xyes -a x$use_mpi = xyes -a x$use_mpicxx = xyes; then
+###############################################################################
+#                                                                             #
+#                             MPI Master Slave                                #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
+                              [Enable StarPU to run with the master-slave mode])],
+              use_mpi_master_slave=$enableval,
+              use_mpi_master_slave=no)
+#We can only build MPI Master Slave if User wants it and MPI compiler are available
+if test x$use_mpi_master_slave = xyes -a x$mpicc_path != xno -a x${mpicxx_path} != xno ; then
     build_mpi_master_slave=yes
     build_mpi_master_slave=yes
 else
 else
     build_mpi_master_slave=no
     build_mpi_master_slave=no
@@ -417,7 +422,9 @@ fi
 #users cannot use both at the same time
 #users cannot use both at the same time
 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
     AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
     AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
-	enable_mpi=no
+    build_mpi_lib=no
+    build_nmad_lib=no
+    enable_mpi=no
 fi
 fi
 
 
 if test x$build_mpi_master_slave = xyes; then
 if test x$build_mpi_master_slave = xyes; then
@@ -449,95 +456,19 @@ AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
 AC_MSG_RESULT($nmaxmpidev)
 AC_MSG_RESULT($nmaxmpidev)
 AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
 AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
 
 
-
-###############################################################################
-#                                                                             #
-#                                NEW MADELEINE                                #
-#                                                                             #
-###############################################################################
-
-AC_ARG_ENABLE(nmad, [AS_HELP_STRING([--enable-nmad],
-                              [Enable StarPU MPI library generation using new madeleine instead of mpi])],
-            [enable_nmad=$enableval],
-            [enable_nmad=no])
-
-if test x$use_mpi = xyes -a \( x$enable_nmad \) ; then
-    cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
-else
-    cc_or_mpicc=$CC
-fi
-
-build_nmad_lib=no
-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
-#We can only build StarPU MPI Library if User wants it and MPI is available
-if test x$use_mpi = xyes -a x$enable_nmad = xyes ; then
-    build_nmad_lib=yes
-    enable_mpi=no
-    PKG_CHECK_MODULES([NMAD],[nmad])
-else
-    build_nmad_lib=no
-fi
-
-# in case NMAD was explicitely required, but the compiler MPI, this is an error
-if test x$enable_nmad = xyes -a ! -x "$mpicc_path"; then
-   AC_MSG_ERROR([Compiler MPI not valid])
-fi
-
-
-AC_MSG_CHECKING(whether the StarPU MPI nmad library should be generated)
-AC_MSG_RESULT($build_nmad_lib)
-
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
-#                                LIBTOOLS                                     #
+#                       Miscellaneous things for MPI                          #
 #                                                                             #
 #                                                                             #
 ###############################################################################
 ###############################################################################
 
 
-#c++11 detection
-AX_CXX_COMPILE_STDCXX(11,noext,optional)
-
-AC_SUBST([STARPU_HAVE_CXX11], $HAVE_CXX11)
-AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
-if test $HAVE_CXX11 -eq 1; then
-  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
-fi
-
-LT_PREREQ([2.2])
-LT_INIT([win32-dll])
-
-AC_HEADER_STDC
-
-AC_C_RESTRICT
-
-# Check if bash is available
-AC_PATH_PROG([REALBASH], [bash], , [/bin:$PATH])
-
-# Record git version
-AC_PATH_PROG(gitcommand, git)
-if test "$gitcommand" = "" ; then
-   if test -f $srcdir/STARPU-REVISION ; then
-      cp $srcdir/STARPU-REVISION .
-   else
-      echo "unknown" > ./STARPU-REVISION
-   fi
-else
-   git log -n 1 --pretty="%H" $srcdir > ./STARPU-REVISION
+AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
+				   [Prevent StarPU MPI from reading buffers while being sent over MPI])],
+				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
+if test x$enable_mpi_pedantic_isend = xyes; then
+	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
 fi
 fi
 
 
-AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
-
-###############################################################################
-#                                                                             #
-#                       Miscellaneous things for MPI                          #
-#                                                                             #
-###############################################################################
-
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]),
 AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]),
 	      [enable_mpi_check=$enableval], [enable_mpi_check=$default_enable_mpi_check])
 	      [enable_mpi_check=$enableval], [enable_mpi_check=$default_enable_mpi_check])
@@ -551,68 +482,45 @@ fi
 if test x$enable_mpi_check = xno ; then
 if test x$enable_mpi_check = xno ; then
     running_mpi_check=no
     running_mpi_check=no
 fi
 fi
+if test x$enable_mpi = xno ; then
+    running_mpi_check=no
+fi
 
 
-
-if test x$enable_simgrid = xno ; then
+if test x$enable_mpi = xyes -a x$running_mpi_check = xyes ; then
     # Check if mpiexec is available
     # Check if mpiexec is available
-    AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
-                [Path of mpiexec])],
-        [
-            if test x$withval = xyes; then
-                AC_MSG_ERROR(--with-mpiexec must be given a pathname)
-            else
-                mpiexec_path=$withval
-            fi
-        ],
-        [
-            # nothing was specified: look in the path
-	    if test x$mpicc_path = x ; then
-		AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$PATH])
-	    else
-		AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
-	    fi
-        ])
-
+    if test x$enable_simgrid = xyes ; then
+	DEFAULT_MPIEXEC=smpirun
+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]], [Path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
+	AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$simgrid_dir/bin:$PATH])
+    else
+	DEFAULT_MPIEXEC=mpiexec
+	AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<path to mpiexec>], [Path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
+	if test x$mpicc_path = x ; then
+	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$PATH])
+	else
+	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$(dirname $mpicc_path):$PATH])
+	fi
+    fi
     AC_MSG_CHECKING(whether mpiexec is available)
     AC_MSG_CHECKING(whether mpiexec is available)
     AC_MSG_RESULT($mpiexec_path)
     AC_MSG_RESULT($mpiexec_path)
 
 
     # We test if MPIEXEC exists
     # We test if MPIEXEC exists
     if test ! -x $mpiexec_path; then
     if test ! -x $mpiexec_path; then
-        # if it's not valid, it could be the parameter given to configure.ac was not a full path, let's look for it
-	if test x$mpicc_path = x ; then
-            AC_PATH_PROG(mpiexec_path_bis, $mpiexec_path, [no], [$PATH])
-	else
-            AC_PATH_PROG(mpiexec_path_bis, $mpiexec_path, [no], [$(dirname $mpicc_path):$PATH])
-	fi
-        AC_MSG_CHECKING(whether mpiexec is available (2nd try))
-        AC_MSG_RESULT($mpiexec_path_bis)
-	if test -x $mpiexec_path_bis; then
-	   mpiexec_path=$mpiexec_path_bis
-	else
-           #MPIEXEC does not exists or is not executable
-           AC_MSG_RESULT(The mpiexec script is not valid)
-           running_mpi_check=no
-           mpiexec_path=""
-	fi
+        AC_MSG_RESULT(The mpiexec script '$mpiexec_path' is not valid)
+        running_mpi_check=no
+        mpiexec_path=""
     fi
     fi
     AC_SUBST(MPIEXEC,$mpiexec_path)
     AC_SUBST(MPIEXEC,$mpiexec_path)
 fi
 fi
 
 
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
-if test x$use_mpi = xyes ; then
-    AC_MSG_CHECKING(whether MPI tests should be run)
-    AC_MSG_RESULT($running_mpi_check)
-fi
-
-#We can only build StarPU MPI Library if User wants it and MPI is available
-if test x$use_mpi = xyes -a x$enable_mpi = xyes ; then
-    build_mpi_lib=yes
-else
-    build_mpi_lib=no
-fi
+AC_MSG_CHECKING(whether MPI tests should be run)
+AC_MSG_RESULT($running_mpi_check)
 
 
 AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
 AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
 AC_MSG_RESULT($build_mpi_lib)
 AC_MSG_RESULT($build_mpi_lib)
+AC_MSG_CHECKING(whether the StarPU MPI nmad library should be generated)
+AC_MSG_RESULT($build_nmad_lib)
 
 
 AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes)
 AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes)
 if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
 if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
@@ -622,11 +530,9 @@ if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
 	else
 	else
 		AC_DEFINE(STARPU_USE_MPI_NMAD,[1],[whether the StarPU MPI library (with a NewMadeleine implementation) is available])
 		AC_DEFINE(STARPU_USE_MPI_NMAD,[1],[whether the StarPU MPI library (with a NewMadeleine implementation) is available])
 	fi
 	fi
-else
-	running_mpi_check=no
 fi
 fi
 
 
-if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
+if test x$enable_mpi = xyes ; then
     if test x$enable_simgrid = xyes ; then
     if test x$enable_simgrid = xyes ; then
         if test x$enable_shared = xyes ; then
         if test x$enable_shared = xyes ; then
 	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, if you need the MPI support, then use --disable-shared to fix this, else disable MPI with --disable-mpi])
 	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, if you need the MPI support, then use --disable-shared to fix this, else disable MPI with --disable-mpi])
@@ -644,17 +550,16 @@ AM_CONDITIONAL(STARPU_USE_MPI_NMAD, test x$build_nmad_lib = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI, test x$build_nmad_lib = xyes -o x$build_mpi_lib = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI, test x$build_nmad_lib = xyes -o x$build_mpi_lib = xyes)
 
 
 AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
 AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
-			[Arguments for mpiexec])],
-	[
+					  [Arguments for mpiexec])],
+	    [
 		mpiexec_args=$withval
 		mpiexec_args=$withval
-	])
+	    ])
 AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
 AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
 
 
-
 AC_MSG_CHECKING(whether MPI debug messages should be displayed)
 AC_MSG_CHECKING(whether MPI debug messages should be displayed)
 AC_ARG_ENABLE(mpi-verbose, [AS_HELP_STRING([--enable-mpi-verbose],
 AC_ARG_ENABLE(mpi-verbose, [AS_HELP_STRING([--enable-mpi-verbose],
-			[display MPI verbose debug messages (--enable-mpi-verbose=extra increase the verbosity)])],
-			enable_mpi_verbose=$enableval, enable_mpi_verbose=no)
+					   [display MPI verbose debug messages (--enable-mpi-verbose=extra increase the verbosity)])],
+	      enable_mpi_verbose=$enableval, enable_mpi_verbose=no)
 AC_MSG_RESULT($enable_mpi_verbose)
 AC_MSG_RESULT($enable_mpi_verbose)
 if test x$enable_mpi_verbose = xyes; then
 if test x$enable_mpi_verbose = xyes; then
 	AC_DEFINE(STARPU_MPI_VERBOSE, [1], [display MPI verbose debug messages])
 	AC_DEFINE(STARPU_MPI_VERBOSE, [1], [display MPI verbose debug messages])
@@ -664,6 +569,19 @@ if test x$enable_mpi_verbose = xextra; then
 	AC_DEFINE(STARPU_MPI_EXTRA_VERBOSE, [1], [display MPI verbose debug messages])
 	AC_DEFINE(STARPU_MPI_EXTRA_VERBOSE, [1], [display MPI verbose debug messages])
 fi
 fi
 
 
+if test x$enable_mpi = xyes -o x$build_mpi_master_slave = xyes ; then
+    cc_or_mpicc=$mpicc_path
+    # For some reason, libtool uses gcc instead of mpicc when linking
+    # libstarpumpi.
+    # On Darwin (and maybe other systems ?) the linker will fail (undefined
+    # references to MPI_*). We manually add the required flags to fix this
+    # issue.
+    AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
+else
+    cc_or_mpicc=$CC
+fi
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
 #                           MIC device compilation                            #
 #                           MIC device compilation                            #

+ 1 - 1
doc/doxygen/chapters/310_data_management.doxy

@@ -602,7 +602,7 @@ whole machine, it would not be efficient to accumulate them in only one place,
 incurring data transmission each and access concurrency.
 incurring data transmission each and access concurrency.
 
 
 StarPU provides a mode ::STARPU_REDUX, which permits to optimize
 StarPU provides a mode ::STARPU_REDUX, which permits to optimize
-this case: it will allocate a buffer on each memory node, and accumulate
+this case: it will allocate a buffer on each worker (lazily), and accumulate
 intermediate results there. When the data is eventually accessed in the normal
 intermediate results there. When the data is eventually accessed in the normal
 mode ::STARPU_R, StarPU will collect the intermediate results in just one
 mode ::STARPU_R, StarPU will collect the intermediate results in just one
 buffer.
 buffer.

+ 4 - 3
doc/doxygen/chapters/320_scheduling.doxy

@@ -190,9 +190,10 @@ single task gives the consumption of the task in Joules, which can be given to
 starpu_perfmodel_update_history().
 starpu_perfmodel_update_history().
 
 
 Another way to provide the energy performance is to define a
 Another way to provide the energy performance is to define a
-perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH, and set the
-starpu_perfmodel::arch_cost_function field to a function which shall return the
-estimated consumption of the task in Joules. Such a function can for instance
+perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH or
+::STARPU_PER_WORKER, and set the starpu_perfmodel::arch_cost_function or
+starpu_perfmodel::worker_cost_function field to a function which shall return
+the estimated consumption of the task in Joules. Such a function can for instance
 use starpu_task_expected_length() on the task (in µs), multiplied by the
 use starpu_task_expected_length() on the task (in µs), multiplied by the
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 get Joules.
 get Joules.

+ 2 - 1
doc/doxygen/chapters/350_scheduling_policy_definition.doxy

@@ -45,7 +45,8 @@ provides a complete list of the functions available for writing advanced schedul
 This includes getting an estimation for a task computation completion with
 This includes getting an estimation for a task computation completion with
 starpu_task_expected_length(), for the required data transfers with
 starpu_task_expected_length(), for the required data transfers with
 starpu_task_expected_data_transfer_time_for(), for the required energy with
 starpu_task_expected_data_transfer_time_for(), for the required energy with
-starpu_task_expected_energy(), etc. Other
+starpu_task_expected_energy(), etc. Per-worker variants are also available with
+starpu_task_worker_expected_length(), etc. Other
 useful functions include starpu_transfer_bandwidth(), starpu_transfer_latency(),
 useful functions include starpu_transfer_bandwidth(), starpu_transfer_latency(),
 starpu_transfer_predict(), ...
 starpu_transfer_predict(), ...
 One can also directly test the presence of a data handle with starpu_data_is_on_node().
 One can also directly test the presence of a data handle with starpu_data_is_on_node().

+ 5 - 0
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -401,6 +401,11 @@ filled with pointers to functions which return the expected duration
 of the task in micro-seconds, one per architecture, see for instance
 of the task in micro-seconds, one per architecture, see for instance
 <c>tests/datawizard/locality.c</c>
 <c>tests/datawizard/locality.c</c>
 </li>
 </li>
+
+<li>
+Provided explicitly by the application (model type ::STARPU_PER_WORKER)
+similarly with the starpu_perfmodel::worker_cost_function field.
+</li>
 </ul>
 </ul>
 
 
 For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and
 For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and

+ 8 - 1
include/starpu_perfmodel.h

@@ -165,6 +165,7 @@ struct starpu_perfmodel_per_arch
 enum starpu_perfmodel_type
 enum starpu_perfmodel_type
 {
 {
         STARPU_PERFMODEL_INVALID=0,
         STARPU_PERFMODEL_INVALID=0,
+	STARPU_PER_WORKER,                /**< Application-provided per-worker cost model function */
 	STARPU_PER_ARCH,                  /**< Application-provided per-arch cost model function */
 	STARPU_PER_ARCH,                  /**< Application-provided per-arch cost model function */
 	STARPU_COMMON,                    /**< Application-provided common cost model function, with per-arch factor */
 	STARPU_COMMON,                    /**< Application-provided common cost model function, with per-arch factor */
 	STARPU_HISTORY_BASED,             /**< Automatic history-based cost model */
 	STARPU_HISTORY_BASED,             /**< Automatic history-based cost model */
@@ -226,11 +227,17 @@ struct starpu_perfmodel
 	*/
 	*/
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 	/**
 	/**
-	   Used by ::STARPU_COMMON. Take a task, an arch and implementation
+	   Used by ::STARPU_PER_ARCH. Take a task, an arch and implementation
 	   number, and must return a task duration estimation in
 	   number, and must return a task duration estimation in
 	   micro-seconds on that arch.
 	   micro-seconds on that arch.
 	*/
 	*/
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
 	double (*arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch * arch, unsigned nimpl);
+	/**
+	   Used by ::STARPU_PER_WORKER. Take a task, a worker id and implementation
+	   number, and must return a task duration estimation in
+	   micro-seconds on that worker.
+	*/
+	double (*worker_cost_function)(struct starpu_task *, unsigned workerid, unsigned nimpl);
 
 
 	/**
 	/**
 	   Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
 	   Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and

+ 14 - 0
include/starpu_scheduler.h

@@ -110,6 +110,10 @@ struct starpu_sched_policy
 	   to be executed by the worker. This method therefore permits
 	   to be executed by the worker. This method therefore permits
 	   to keep the state of the scheduler coherent even when
 	   to keep the state of the scheduler coherent even when
 	   StarPU bypasses the scheduling strategy.
 	   StarPU bypasses the scheduling strategy.
+
+	   Note: to get an estimation of the task duration, \p perf_workerid
+	   needs to be used rather than \p workerid, for the case of parallel
+	   tasks.
 	*/
 	*/
 	void (*push_task_notify)(struct starpu_task *, int workerid, int perf_workerid, unsigned sched_ctx_id);
 	void (*push_task_notify)(struct starpu_task *, int workerid, int perf_workerid, unsigned sched_ctx_id);
 
 
@@ -366,6 +370,11 @@ uint32_t starpu_task_data_footprint(struct starpu_task *task);
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
 
 /**
 /**
+   Same as starpu_task_expected_length() but for a precise worker.
+*/
+double starpu_task_worker_expected_length(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl);
+
+/**
    Return an estimated speedup factor relative to CPU speed
    Return an estimated speedup factor relative to CPU speed
 */
 */
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch);
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch);
@@ -395,6 +404,11 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
 
 /**
 /**
+   Same as starpu_task_expected_energy but for a precise worker
+*/
+double starpu_task_worker_expected_energy(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl);
+
+/**
    Return expected conversion time in ms (multiformat interface only)
    Return expected conversion time in ms (multiformat interface only)
 */
 */
 double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);

+ 3 - 3
mpi/include/starpu_mpi.h

@@ -50,9 +50,9 @@ extern "C"
 int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm, struct starpu_conf *conf);
 int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm, struct starpu_conf *conf);
 
 
 /**
 /**
-   Same as starpu_mpi_init_conf(), except that this does not
-   initialize the StarPU library. The caller thus has to call
-   starpu_init() before this.
+   Same as starpu_mpi_init_conf(), except that this does not initialize the
+   StarPU library. The caller thus has to call starpu_init() before this, and it
+   StarPU library. The caller thus has to call starpu_init() before this, and it
 */
 */
 int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm);
 int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm);
 
 

+ 28 - 2
mpi/tests/Makefile.am

@@ -137,7 +137,13 @@ starpu_mpi_TESTS +=				\
 	temporary				\
 	temporary				\
 	user_defined_datatype			\
 	user_defined_datatype			\
 	early_stuff				\
 	early_stuff				\
-	sendrecv_bench
+	sendrecv_bench				\
+	sendrecv_parallel_tasks_bench
+
+if !NO_BLAS_LIB
+starpu_mpi_TESTS +=				\
+	sendrecv_gemm_bench
+endif
 
 
 if !STARPU_SIMGRID
 if !STARPU_SIMGRID
 # missing support in simgrid
 # missing support in simgrid
@@ -226,7 +232,9 @@ noinst_PROGRAMS =				\
 	starpu_redefine				\
 	starpu_redefine				\
 	load_balancer				\
 	load_balancer				\
 	driver					\
 	driver					\
-	sendrecv_bench
+	sendrecv_bench				\
+	sendrecv_gemm_bench			\
+	sendrecv_parallel_tasks_bench
 
 
 XFAIL_TESTS=					\
 XFAIL_TESTS=					\
 	policy_register_toomany			\
 	policy_register_toomany			\
@@ -256,4 +264,22 @@ mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
+
+sendrecv_bench_SOURCES = sendrecv_bench.c
+sendrecv_bench_SOURCES += bench_helper.c
+sendrecv_bench_SOURCES += abstract_sendrecv_bench.c
+
+sendrecv_parallel_tasks_bench_SOURCES = sendrecv_parallel_tasks_bench.c
+sendrecv_parallel_tasks_bench_SOURCES += bench_helper.c
+sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
+
+if !NO_BLAS_LIB
+sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
+sendrecv_gemm_bench_SOURCES += bench_helper.c
+sendrecv_gemm_bench_SOURCES += abstract_sendrecv_bench.c
+sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
+
+sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
+endif
+
 endif
 endif

+ 136 - 0
mpi/tests/abstract_sendrecv_bench.c

@@ -0,0 +1,136 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "bench_helper.h"
+#include "abstract_sendrecv_bench.h"
+
+
+
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
+{
+	uint64_t iterations = LOOPS_DEFAULT;
+
+	if (mpi_rank >= 2)
+	{
+		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+		{
+			iterations = bench_nb_iterations(iterations, s);
+
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+
+			for (uint64_t j = 0; j < iterations; j++)
+			{
+				starpu_mpi_barrier(MPI_COMM_WORLD);
+			}
+		}
+
+		return;
+	}
+
+	if (mpi_rank == 0)
+	{
+		printf("Times in us\n");
+		printf("# size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
+	}
+
+	int array_size = 0;
+	starpu_data_handle_t handle_send, handle_recv;
+	float* vector_send = NULL;
+	float* vector_recv = NULL;
+	double t1, t2, global_tstart, global_tend;
+	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
+
+	if (thread_barrier != NULL)
+	{
+		STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
+	}
+
+	global_tstart = starpu_timing_now();
+	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	{
+		vector_send = malloc(s);
+		vector_recv = malloc(s);
+		memset(vector_send, 0, s);
+		memset(vector_recv, 0, s);
+
+		starpu_vector_data_register(&handle_send, STARPU_MAIN_RAM, (uintptr_t) vector_send, s, 1);
+		starpu_vector_data_register(&handle_recv, STARPU_MAIN_RAM, (uintptr_t) vector_recv, s, 1);
+
+		iterations = bench_nb_iterations(iterations, s);
+
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+
+		for (uint64_t j = 0; j < iterations; j++)
+		{
+			if (mpi_rank == 0)
+			{
+				t1 = starpu_timing_now();
+				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+				t2 = starpu_timing_now();
+
+				const double t = (t2 -t1) / 2;
+
+				lats[j] = t;
+			}
+			else
+			{
+				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+			}
+
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+		}
+
+		if (mpi_rank == 0)
+		{
+			qsort(lats, iterations, sizeof(double), &comp_double);
+
+			const double min_lat = lats[0];
+			const double max_lat = lats[iterations - 1];
+			const double med_lat = lats[(iterations - 1) / 2];
+			const double d1_lat = lats[(iterations - 1) / 10];
+			const double d9_lat = lats[9 * (iterations - 1) / 10];
+			double avg_lat = 0.0;
+
+			for(uint64_t k = 0; k < iterations; k++)
+			{
+				avg_lat += lats[k];
+			}
+
+			avg_lat /= iterations;
+			const double bw_million_byte = s / min_lat;
+			const double bw_mbyte        = bw_million_byte / 1.048576;
+
+			printf("%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
+				(long long)s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
+			fflush(stdout);
+		}
+		starpu_data_unregister(handle_recv);
+		starpu_data_unregister(handle_send);
+
+		free(vector_send);
+		free(vector_recv);
+	}
+	global_tend = starpu_timing_now();
+
+	if (mpi_rank == 0)
+	{
+		printf("Comm bench took %9.3lf ms\n", (global_tend - global_tstart) / 1000);
+	}
+
+	free(lats);
+}

+ 21 - 0
mpi/tests/abstract_sendrecv_bench.h

@@ -0,0 +1,21 @@
+
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier);

+ 62 - 0
mpi/tests/bench_helper.c

@@ -0,0 +1,62 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "bench_helper.h"
+
+
/* qsort()-compatible comparator sorting doubles in ascending order. */
int comp_double(const void*_a, const void*_b)
{
	const double x = *(const double*)_a;
	const double y = *(const double*)_b;

	/* (x > y) - (x < y) yields 1, -1 or 0 without branching. */
	return (x > y) - (x < y);
}
+
+
+uint64_t bench_next_size(uint64_t len)
+{
+	uint64_t next = len * MULT_DEFAULT + INCR_DEFAULT;
+
+	if(next <= len)
+		next++;
+
+	return next;
+}
+
+
+uint64_t bench_nb_iterations(int iterations, uint64_t len)
+{
+	const uint64_t max_data = NX_MAX;
+
+	if(len <= 0)
+		len = 1;
+
+	uint64_t data_size = ((uint64_t)iterations * (uint64_t)len);
+
+	if(data_size  > max_data)
+	{
+		iterations = (max_data / (uint64_t)len);
+		if(iterations < 2)
+			iterations = 2;
+	}
+
+	return iterations;
+}

+ 37 - 0
mpi/tests/bench_helper.h

@@ -0,0 +1,37 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NX_MAX (512 * 1024 * 1024) // kB
+#define NX_MIN 0
+#ifdef STARPU_QUICK_CHECK
+#define MULT_DEFAULT 4
+#else
+#define MULT_DEFAULT 2
+#endif
+#define INCR_DEFAULT 0
+#ifdef STARPU_QUICK_CHECK
+#define LOOPS_DEFAULT 100
+#else
+#define LOOPS_DEFAULT 100000
+#endif
+
+int comp_double(const void*_a, const void*_b);
+uint64_t bench_next_size(uint64_t len);
+uint64_t bench_nb_iterations(int iterations, uint64_t len);

+ 6 - 168
mpi/tests/sendrecv_bench.c

@@ -18,84 +18,15 @@
  * Inspired a lot from NewMadeleine examples/benchmarks/nm_bench_sendrecv.c
  * Inspired a lot from NewMadeleine examples/benchmarks/nm_bench_sendrecv.c
  */
  */
 
 
-#include <math.h>
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "helper.h"
+#include "abstract_sendrecv_bench.h"
 
 
-#define NX_MAX (512 * 1024 * 1024) // kB
-#define NX_MIN 0
-#ifdef STARPU_QUICK_CHECK
-#define MULT_DEFAULT 4
-#else
-#define MULT_DEFAULT 2
-#endif
-#define INCR_DEFAULT 0
-#define NX_STEP 1.4 // multiplication
-#ifdef STARPU_QUICK_CHECK
-#define LOOPS_DEFAULT 100
-#else
-#define LOOPS_DEFAULT 10000
-#endif
-
-int times_nb_nodes;
-int times_size;
-int worldsize;
-
-static int comp_double(const void*_a, const void*_b)
-{
-	const double* a = _a;
-	const double* b = _b;
-
-	if(*a < *b)
-		return -1;
-	else if(*a > *b)
-		return 1;
-	else
-		return 0;
-}
-
-static inline uint64_t _next(uint64_t len, double multiplier, uint64_t increment)
-{
-	uint64_t next = len * multiplier + increment;
-
-	if(next <= len)
-		next++;
-
-	return next;
-}
-
-
-static inline uint64_t _iterations(int iterations, uint64_t len)
-{
-	const uint64_t max_data = 512 * 1024 * 1024;
-
-	if(len <= 0)
-		len = 1;
-
-	uint64_t data_size = ((uint64_t)iterations * (uint64_t)len);
-
-	if(data_size  > max_data)
-	{
-		iterations = (max_data / (uint64_t)len);
-		if(iterations < 2)
-			iterations = 2;
-	}
-
-	return iterations;
-}
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	int ret, rank;
-	starpu_data_handle_t handle_send, handle_recv;
+	int ret, rank, worldsize;
 	int mpi_init;
 	int mpi_init;
-	float* vector_send = NULL;
-	float* vector_recv = NULL;
-	double t1, t2;
-	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
-	uint64_t iterations = LOOPS_DEFAULT;
-	double multiplier = MULT_DEFAULT;
-	uint64_t increment = INCR_DEFAULT;
 
 
 	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
 	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
@@ -115,108 +46,15 @@ int main(int argc, char **argv)
 		return STARPU_TEST_SKIPPED;
 		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
-	if (rank >= 2)
-	{
-		starpu_pause();
-		for (uint64_t s = NX_MIN; s <= NX_MAX; s = _next(s, multiplier, increment))
-		{
-			iterations = _iterations(iterations, s);
-
-			starpu_mpi_barrier(MPI_COMM_WORLD);
-
-			for (uint64_t j = 0; j < iterations; j++)
-			{
-				starpu_mpi_barrier(MPI_COMM_WORLD);
-			}
-		}
-		starpu_resume();
-
-		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
-		return 0;
-	}
-
-	if (rank == 0)
-	{
-		printf("Times in us\n");
-		printf("# size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
-	}
-
-	int array_size = 0;
-
-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = _next(s, multiplier, increment))
-	{
-		vector_send = malloc(s);
-		vector_recv = malloc(s);
-		memset(vector_send, 0, s);
-		memset(vector_recv, 0, s);
-
-		starpu_vector_data_register(&handle_send, STARPU_MAIN_RAM, (uintptr_t) vector_send, s, 1);
-		starpu_vector_data_register(&handle_recv, STARPU_MAIN_RAM, (uintptr_t) vector_recv, s, 1);
-
-		iterations = _iterations(iterations, s);
+	/* Pause workers for this bench: all workers polling for tasks has a strong impact on performances */
+	starpu_pause();
 
 
-		starpu_mpi_barrier(MPI_COMM_WORLD);
-
-		for (uint64_t j = 0; j < iterations; j++)
-		{
-			if (rank == 0)
-			{
-				t1 = starpu_timing_now();
-				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
-				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
-				t2 = starpu_timing_now();
-
-				const double delay = t2 - t1;
-				const double t = delay / 2;
-
-				lats[j] = t;
-			}
-			else
-			{
-				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
-				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
-			}
-
-			starpu_mpi_barrier(MPI_COMM_WORLD);
-		}
-
-		if (rank == 0)
-		{
-			qsort(lats, iterations, sizeof(double), &comp_double);
-
-			const double min_lat = lats[0];
-			const double max_lat = lats[iterations - 1];
-			const double med_lat = lats[(iterations - 1) / 2];
-			const double d1_lat = lats[(iterations - 1) / 10];
-			const double d9_lat = lats[9 * (iterations - 1) / 10];
-			double avg_lat = 0.0;
-
-			for(uint64_t k = 0; k < iterations; k++)
-			{
-				avg_lat += lats[k];
-			}
-
-			avg_lat /= iterations;
-			const double bw_million_byte = s / min_lat;
-			const double bw_mbyte        = bw_million_byte / 1.048576;
-
-			printf("%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
-				(long long)s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
-			fflush(stdout);
-		}
-		starpu_data_unregister(handle_recv);
-		starpu_data_unregister(handle_send);
-
-		free(vector_send);
-		free(vector_recv);
-	}
+	sendrecv_bench(rank, NULL);
 
 
+	starpu_resume();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	if (!mpi_init)
 	if (!mpi_init)
 		MPI_Finalize();
 		MPI_Finalize();
 
 
-	free(lats);
 	return 0;
 	return 0;
 }
 }

+ 463 - 0
mpi/tests/sendrecv_gemm_bench.c

@@ -0,0 +1,463 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Simple *not distributed* parallel GEMM implementation and sendrecv bench at the same time.
+ *
+ * This bench is a merge of mpi/tests/sendrecv_bench and examples/mult/sgemm
+ *
+ * A *non-distributed* GEMM is computed on each node, while a sendrecv bench is running,
+ * completely independently. The goal is to measure the impact of worker computations on
+ * communications.
+ *
+ * Use the -nblocks parameter to define the matrix size (matrix size = nblocks * 320), such as
+ * the GEMM finishes after the sendrecv bench.
+ */
+#include <limits.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <starpu_mpi.h>
+#include <starpu_fxt.h>
+
+#include <common/blas.h>
+
+#include "helper.h"
+#include "abstract_sendrecv_bench.h"
+#include "../../examples/mult/simple.h"
+
+#define CHECK_TASK_SUBMIT(ret) do {				\
+	if (ret == -ENODEV)					\
+	{							\
+		ret = 77;					\
+		goto enodev;					\
+	}							\
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");	\
+} while(0)
+
+static int mpi_rank;
+static int comm_thread_cpuid = -1;
+static unsigned nslices = 4;
+#if defined(STARPU_QUICK_CHECK) && !defined(STARPU_SIMGRID)
+static unsigned matrix_dim = 256;
+#else
+static unsigned matrix_dim = 320 * 4;
+#endif
+static unsigned check = 0;
+
+static TYPE *A, *B, *C;
+static starpu_data_handle_t A_handle, B_handle, C_handle;
+
+static starpu_pthread_barrier_t thread_barrier;
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+#define PRINTF(fmt, ...) do { if (!getenv("STARPU_SSILENT")) {printf(fmt, ## __VA_ARGS__); fflush(stdout); }} while(0)
+
+static void check_output(void)
+{
+	/* compute C = C - AB */
+	CPU_GEMM("N", "N", matrix_dim, matrix_dim, matrix_dim, (TYPE)-1.0f, A, matrix_dim, B, matrix_dim, (TYPE)1.0f, C, matrix_dim);
+
+	/* make sure C = 0 */
+	TYPE err;
+	err = CPU_ASUM(matrix_dim*matrix_dim, C, 1);
+
+	if (err < matrix_dim*matrix_dim*0.001)
+	{
+		FPRINTF(stderr, "Results are OK\n");
+	}
+	else
+	{
+		int max;
+		max = CPU_IAMAX(matrix_dim*matrix_dim, C, 1);
+
+		FPRINTF(stderr, "There were errors ... err = %f\n", err);
+		FPRINTF(stderr, "Max error : %e\n", C[max]);
+	}
+}
+
+static void init_problem_data(void)
+{
+#ifndef STARPU_SIMGRID
+	unsigned i,j;
+#endif
+
+	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+
+#ifndef STARPU_SIMGRID
+	/* fill the matrices */
+	for (j=0; j < matrix_dim; j++)
+	{
+		for (i=0; i < matrix_dim; i++)
+		{
+			A[j+i*matrix_dim] = (TYPE)(starpu_drand48());
+			B[j+i*matrix_dim] = (TYPE)(starpu_drand48());
+			C[j+i*matrix_dim] = (TYPE)(0);
+		}
+	}
+#endif
+}
+
+static void partition_mult_data(void)
+{
+	starpu_matrix_data_register(&A_handle, STARPU_MAIN_RAM, (uintptr_t)A,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&B_handle, STARPU_MAIN_RAM, (uintptr_t)B,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+	starpu_matrix_data_register(&C_handle, STARPU_MAIN_RAM, (uintptr_t)C,
+		matrix_dim, matrix_dim, matrix_dim, sizeof(TYPE));
+
+	struct starpu_data_filter vert;
+	memset(&vert, 0, sizeof(vert));
+	vert.filter_func = starpu_matrix_filter_vertical_block;
+	vert.nchildren = nslices;
+
+	struct starpu_data_filter horiz;
+	memset(&horiz, 0, sizeof(horiz));
+	horiz.filter_func = starpu_matrix_filter_block;
+	horiz.nchildren = nslices;
+
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
+
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
+}
+
+
+void cpu_init_matrix_random(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (starpu_drand48());
+		subB[i] = (TYPE) (starpu_drand48());
+	}
+}
+
+
+void cpu_init_matrix_zero(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+
+	for (unsigned i = 0; i < nx *ny; i++)
+	{
+		subA[i] = (TYPE) (0);
+	}
+}
+
+
+void cpu_mult(void *descr[], void *arg)
+{
+	(void)arg;
+	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	TYPE *subC = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned nxC = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned nyC = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned nyA = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ldA = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
+
+	int worker_size = starpu_combined_worker_get_size();
+
+	if (worker_size == 1)
+	{
+		/* Sequential CPU task */
+		CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB, (TYPE)0.0, subC, ldC);
+	}
+	else
+	{
+		/* Parallel CPU task */
+		unsigned rank = starpu_combined_worker_get_rank();
+
+		unsigned block_size = (nyC + worker_size - 1)/worker_size;
+		unsigned new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
+
+		STARPU_ASSERT(nyC == STARPU_MATRIX_GET_NY(descr[1]));
+
+		TYPE *new_subB = &subB[block_size*rank];
+		TYPE *new_subC = &subC[block_size*rank];
+
+		CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
+	}
+}
+
+static struct starpu_perfmodel starpu_gemm_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = STARPU_GEMM_STR(gemm)
+};
+
+static struct starpu_codelet cl =
+{
+	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_mult},
+	.cpu_funcs_name = {"cpu_mult"},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &starpu_gemm_model
+};
+
+static struct starpu_codelet cl_init_matrix_random =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_random},
+	.cpu_funcs_name = {"cpu_init_matrix_random"},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W}
+};
+
+static struct starpu_codelet cl_init_matrix_zero =
+{
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {cpu_init_matrix_zero},
+	.cpu_funcs_name = {"cpu_init_matrix_zero"},
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			char *argptr;
+			nslices = strtol(argv[++i], &argptr, 10);
+			matrix_dim = 320 * nslices;
+		}
+
+		else if (strcmp(argv[i], "-size") == 0)
+		{
+			char *argptr;
+			unsigned matrix_dim_tmp = strtol(argv[++i], &argptr, 10);
+			if (matrix_dim_tmp % 320 != 0)
+			{
+				fprintf(stderr, "Matrix size has to be a multiple of 320\n");
+			}
+			else
+			{
+				matrix_dim = matrix_dim_tmp;
+				nslices = matrix_dim / 320;
+			}
+		}
+
+		else if (strcmp(argv[i], "-check") == 0)
+		{
+			check = 1;
+		}
+
+		else if (strcmp(argv[i], "-spmd") == 0)
+		{
+			cl.type = STARPU_SPMD;
+		}
+
+		else if (strcmp(argv[i], "-comm-thread-cpuid") == 0)
+		{
+			comm_thread_cpuid = atoi(argv[++i]);
+		}
+
+		else if (strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
+		{
+			fprintf(stderr,"Usage: %s [-nblocks n] [-size size] [-check] [-spmd] [-comm_thread_cpuid cpuid]\n", argv[0]);
+			fprintf(stderr,"Currently selected: matrix size: %u - %u blocks\n", matrix_dim, nslices);
+			fprintf(stderr, "Use -comm_thread_cpuid to specifiy where to bind the comm benchmarking thread\n");
+			exit(EXIT_SUCCESS);
+		}
+		else
+		{
+			fprintf(stderr,"Unrecognized option %s", argv[i]);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+
+static void* comm_thread_func(void* arg)
+{
+	if (comm_thread_cpuid < 0)
+	{
+		comm_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
+	}
+
+	if (starpu_bind_thread_on(comm_thread_cpuid, 0, "Comm") < 0)
+	{
+		char hostname[65];
+		gethostname(hostname, sizeof(hostname));
+		_STARPU_DISP("[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
+	}
+
+	sendrecv_bench(mpi_rank, &thread_barrier);
+
+	return NULL;
+}
+
+
+int main(int argc, char **argv)
+{
+	double start, end;
+	int ret, mpi_init, worldsize;
+	starpu_pthread_t comm_thread;
+
+	char hostname[255];
+	gethostname(hostname, 255);
+
+	parse_args(argc, argv);
+
+	starpu_fxt_autostart_profiling(0);
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	if (worldsize < 2)
+	{
+		if (mpi_rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+
+	STARPU_PTHREAD_BARRIER_INIT(&thread_barrier, NULL, 2);
+
+
+	// Start comm thread, benchmarking sendrecv:
+	STARPU_PTHREAD_CREATE(&comm_thread, NULL, comm_thread_func, NULL);
+
+
+	// Main thread will submit GEMM tasks:
+	starpu_malloc_flags((void **)&A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_malloc_flags((void **)&C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	partition_mult_data();
+
+
+	if (mpi_rank == 0)
+	{
+		PRINTF("# node\tx\ty\tz\tms\tGFlops\n");
+	}
+
+	starpu_pause();
+
+	unsigned x, y;
+#ifndef STARPU_SIMGRID
+	// Initialize matrices:
+	for (x = 0; x < nslices; x++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl_init_matrix_random;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, x);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+
+		for (y = 0; y < nslices; y++)
+		{
+			task = starpu_task_create();
+			task->cl = &cl_init_matrix_zero;
+			task->handles[0] = starpu_data_get_sub_data(C_handle, 2, x, y);
+			ret = starpu_task_submit(task);
+			CHECK_TASK_SUBMIT(ret);
+		}
+	}
+#endif
+
+	for (x = 0; x < nslices; x++)
+	for (y = 0; y < nslices; y++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_sub_data(A_handle, 1, y);
+		task->handles[1] = starpu_data_get_sub_data(B_handle, 1, x);
+		task->handles[2] = starpu_data_get_sub_data(C_handle, 2, x, y);
+		task->flops = 2ULL * (matrix_dim/nslices) * (matrix_dim/nslices) * matrix_dim;
+
+		ret = starpu_task_submit(task);
+		CHECK_TASK_SUBMIT(ret);
+		starpu_data_wont_use(starpu_data_get_sub_data(C_handle, 2, x, y));
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	starpu_fxt_start_profiling();
+
+	STARPU_PTHREAD_BARRIER_WAIT(&thread_barrier);
+
+	start = starpu_timing_now();
+	starpu_resume();
+	starpu_task_wait_for_all();
+	end = starpu_timing_now();
+	starpu_pause(); // Pause not to disturb comm thread if it isn't done
+
+	double timing = end - start;
+	double flops = 2.0*((unsigned long long)matrix_dim) * ((unsigned long long)matrix_dim)*((unsigned long long)matrix_dim);
+
+	PRINTF("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
+
+
+enodev:
+	starpu_data_unpartition(C_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(B_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+
+	if (check)
+		check_output();
+
+	starpu_free_flags(A, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(B, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+	starpu_free_flags(C, matrix_dim*matrix_dim*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+
+
+	// Wait comm thread:
+	STARPU_PTHREAD_JOIN(comm_thread, NULL);
+	STARPU_PTHREAD_BARRIER_DESTROY(&thread_barrier);
+
+	starpu_fxt_stop_profiling();
+
+	starpu_resume();
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return ret;
+}

+ 215 - 0
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -0,0 +1,215 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * sendrecv benchmark from different tasks, executed simultaneously on several
+ * workers.
+ * Heavily inspired by NewMadeleine examples/piom/nm_piom_pingpong.c
+ *
+ * The goal is to measure impact of calls to starpu_mpi_* from different threads.
+ *
+ * Use STARPU_NCPU to set the number of parallel ping pongs
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+#include "bench_helper.h"
+#include "abstract_sendrecv_bench.h"
+
+#define NB_WARMUP_PINGPONGS 10
+
+/* We reduce NX_MAX, since some NICs don't support exchanging simultaneously such amount of memory */
+#undef NX_MAX
+#define NX_MAX (64 * 1024 * 1024)
+
+
+/* Codelet body: runs one MPI ping-pong stream between ranks 0 and 1, entirely
+ * from inside a StarPU CPU worker.  Rank 0 measures round-trip latencies and
+ * prints the statistics line for this worker; rank 1 only echoes.  All
+ * arguments (rank, pinned worker id, message size, data handles) come through
+ * the codelet args buffer, not through StarPU-managed buffers. */
+void cpu_task(void* descr[], void* args)
+{
+	int mpi_rank;
+	uint64_t iterations = LOOPS_DEFAULT / 100;
+	uint64_t s;
+	starpu_data_handle_t handle_send, handle_recv;
+	double t1, t2;
+	int asked_worker;
+	int current_worker = starpu_worker_get_id();
+
+	starpu_codelet_unpack_args(args, &mpi_rank, &asked_worker, &s, &handle_send, &handle_recv);
+
+	/* The submitter pinned this task with STARPU_EXECUTE_ON_WORKER; make
+	 * sure the scheduler actually honoured it. */
+	STARPU_ASSERT(asked_worker == current_worker);
+
+	iterations = bench_nb_iterations(iterations, s);
+	double* lats = malloc(sizeof(double) * iterations);
+	STARPU_ASSERT(lats != NULL);
+
+	/* Warm-up ping-pongs: neither timed nor recorded. */
+	for (uint64_t j = 0; j < NB_WARMUP_PINGPONGS; j++)
+	{
+		if (mpi_rank == 0)
+		{
+			starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+			starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+		}
+		else
+		{
+			starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+			starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+		}
+	}
+
+	/* Timed ping-pongs; each recorded latency is half a round trip. */
+	for (uint64_t j = 0; j < iterations; j++)
+	{
+		if (mpi_rank == 0)
+		{
+			t1 = starpu_timing_now();
+			starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+			starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+			t2 = starpu_timing_now();
+
+			lats[j] =  (t2 - t1) / 2;
+		}
+		else
+		{
+			starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+			starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+		}
+	}
+
+	if (mpi_rank == 0)
+	{
+		/* Sort the samples to extract min/median/deciles/max. */
+		qsort(lats, iterations, sizeof(double), &comp_double);
+
+		const double min_lat = lats[0];
+		const double max_lat = lats[iterations - 1];
+		const double med_lat = lats[(iterations - 1) / 2];
+		const double d1_lat = lats[(iterations - 1) / 10];
+		const double d9_lat = lats[9 * (iterations - 1) / 10];
+		double avg_lat = 0.0;
+
+		for(uint64_t k = 0; k < iterations; k++)
+		{
+			avg_lat += lats[k];
+		}
+
+		avg_lat /= iterations;
+		/* Bandwidth derived from the best observed latency. */
+		const double bw_million_byte = s / min_lat;
+		const double bw_mbyte        = bw_million_byte / 1.048576;
+
+		printf("%2d\t\t%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
+			current_worker, (long long) s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
+		fflush(stdout);
+	}
+
+	free(lats); /* was leaked: this codelet runs once per (worker, size) pair */
+}
+
+/* Codelet with no StarPU-managed buffers: the data handles are passed through
+ * the args buffer instead (see cpu_task), because the task drives the MPI
+ * transfers itself with starpu_mpi_send()/starpu_mpi_recv(). */
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = { cpu_task },
+	.cpu_funcs_name = { "cpu_task" },
+	.nbuffers = 0
+};
+
+/* Benchmark driver: for each message size, launches one ping-pong task per
+ * CPU worker (count controlled by STARPU_NCPU) so that all streams run
+ * simultaneously, then collects and frees the per-stream resources.
+ * Only ranks 0 and 1 take part; extra ranks exit immediately. */
+int main(int argc, char **argv)
+{
+	int ret, rank, worldsize;
+	int mpi_init;
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	if (worldsize < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	if (rank == 0)
+	{
+		printf("Times in us\n");
+		printf("# worker | size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
+	}
+	else if (rank >= 2)
+	{
+		/* The ping-pong only involves ranks 0 and 1. */
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return 0;
+	}
+
+
+	unsigned cpu_count = starpu_cpu_worker_get_count();
+
+	/* Per-stream resources, reused across message sizes. */
+	int* workers = malloc(cpu_count * sizeof(int));
+	float** vectors_send = malloc(cpu_count * sizeof(float*));
+	float** vectors_recv = malloc(cpu_count * sizeof(float*));
+	starpu_data_handle_t* handles_send = malloc(cpu_count * sizeof(starpu_data_handle_t));
+	starpu_data_handle_t* handles_recv = malloc(cpu_count * sizeof(starpu_data_handle_t));
+	STARPU_ASSERT(workers && vectors_send && vectors_recv && handles_send && handles_recv);
+
+	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	{
+		/* Pause workers during submission so all streams start together
+		 * when starpu_resume() is called. */
+		starpu_pause();
+
+		for (unsigned i = 0; i < cpu_count; i++)
+		{
+			workers[i] = (int) i;
+			vectors_send[i] = malloc(s);
+			vectors_recv[i] = malloc(s);
+			STARPU_ASSERT(vectors_send[i] && vectors_recv[i]);
+			memset(vectors_send[i], 0, s);
+			memset(vectors_recv[i], 0, s);
+
+			starpu_vector_data_register(&handles_send[i], STARPU_MAIN_RAM, (uintptr_t) vectors_send[i], s, 1);
+			starpu_vector_data_register(&handles_recv[i], STARPU_MAIN_RAM, (uintptr_t) vectors_recv[i], s, 1);
+
+			starpu_task_insert(&cl,
+					STARPU_EXECUTE_ON_WORKER, workers[i],
+					STARPU_VALUE, &rank, sizeof(int),
+					STARPU_VALUE, workers + i, sizeof(int),
+					STARPU_VALUE, &s, sizeof(uint64_t),
+					STARPU_VALUE, &handles_send[i], sizeof(starpu_data_handle_t),
+					STARPU_VALUE, &handles_recv[i], sizeof(starpu_data_handle_t), 0);
+		}
+
+		starpu_resume();
+		starpu_task_wait_for_all();
+
+		for (unsigned i = 0; i < cpu_count; i++)
+		{
+			starpu_data_unregister(handles_send[i]);
+			starpu_data_unregister(handles_recv[i]);
+			free(vectors_send[i]);
+			free(vectors_recv[i]);
+		}
+	}
+
+	free(workers);
+	free(vectors_send);
+	free(vectors_recv);
+	free(handles_send);
+	free(handles_recv);
+
+	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return 0;
+}

+ 46 - 0
src/core/perfmodel/perfmodel.c

@@ -81,6 +81,20 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsi
 }
 }
 
 
 /*
 /*
+ * PER WORKER model
+ */
+
+/* Expected execution time (us) of one task implementation on one worker,
+ * as provided by the user's STARPU_PER_WORKER cost function. */
+static double per_worker_task_expected_perf(struct starpu_perfmodel *model, unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	/* A STARPU_PER_WORKER model is unusable without a cost function. */
+	STARPU_ASSERT_MSG(model->worker_cost_function, "STARPU_PER_WORKER needs worker_cost_function to be defined");
+
+	return model->worker_cost_function(task, workerid, nimpl);
+}
+
+/*
  * PER ARCH model
  * PER ARCH model
  */
  */
 
 
@@ -156,6 +170,7 @@ void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model)
 
 
 	switch (model->type)
 	switch (model->type)
 	{
 	{
+		case STARPU_PER_WORKER:
 		case STARPU_PER_ARCH:
 		case STARPU_PER_ARCH:
 		case STARPU_COMMON:
 		case STARPU_COMMON:
 			/* Nothing more to do than init */
 			/* Nothing more to do than init */
@@ -220,6 +235,20 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 	return exp_perf;
 	return exp_perf;
 }
 }
 
 
+/* Prediction for a given worker: dispatch between STARPU_PER_WORKER models
+ * (direct per-worker callback) and all other model types (per-arch lookup). */
+static double starpu_model_worker_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	/* No model at all: report a zero cost. */
+	if (!model)
+		return 0.0;
+
+	if (model->type == STARPU_PER_WORKER)
+		return per_worker_task_expected_perf(model, workerid, task, nimpl);
+
+	/* Fall back on the per-arch prediction for this worker's archtype. */
+	struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
+	return starpu_model_expected_perf(task, model, arch, nimpl);
+}
+
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	if (!task->cl)
 	if (!task->cl)
@@ -228,6 +257,14 @@ double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfm
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 }
 }
 
 
+/* Expected length (us) of a task on a precise worker, honouring
+ * STARPU_PER_WORKER models when the codelet uses one. */
+double starpu_task_worker_expected_length(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	/* Tasks without codelet don't actually take time */
+	return task->cl
+		? starpu_model_worker_expected_perf(task, task->cl->model, workerid, sched_ctx_id, nimpl)
+		: 0.0;
+}
+
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	if (!task->cl)
 	if (!task->cl)
@@ -236,6 +273,15 @@ double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfm
 	return starpu_model_expected_perf(task, task->cl->energy_model, arch, nimpl);
 	return starpu_model_expected_perf(task, task->cl->energy_model, arch, nimpl);
 }
 }
 
 
+/* Expected energy consumption of a task on a precise worker, honouring
+ * STARPU_PER_WORKER energy models when the codelet uses one. */
+double starpu_task_worker_expected_energy(struct starpu_task *task, unsigned workerid, unsigned sched_ctx_id, unsigned nimpl)
+{
+	if (!task->cl)
+		/* Tasks without codelet don't actually consume energy */
+		return 0.0;
+	return starpu_model_worker_expected_perf(task, task->cl->energy_model, workerid, sched_ctx_id, nimpl);
+
+}
+
 double starpu_task_expected_conversion_time(struct starpu_task *task,
 double starpu_task_expected_conversion_time(struct starpu_task *task,
 					    struct starpu_perfmodel_arch* arch,
 					    struct starpu_perfmodel_arch* arch,
 					    unsigned nimpl)
 					    unsigned nimpl)

+ 5 - 2
src/sched_policies/component_sched.c

@@ -49,7 +49,6 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 	    workerid != -1;
 	    workerid != -1;
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	    workerid = starpu_bitmap_next(component->workers_in_ctx, workerid))
 	{
 	{
-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 		int nimpl;
 		int nimpl;
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		for(nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 		{
@@ -59,9 +58,13 @@ int starpu_sched_component_execute_preds(struct starpu_sched_component * compone
 				double d;
 				double d;
 				can_execute = 1;
 				can_execute = 1;
 				if(bundle)
 				if(bundle)
+				{
+					struct starpu_perfmodel_arch* archtype =
+						starpu_worker_get_perf_archtype(workerid, component->tree->sched_ctx_id);
 					d = starpu_task_bundle_expected_length(bundle, archtype, nimpl);
 					d = starpu_task_bundle_expected_length(bundle, archtype, nimpl);
+				}
 				else
 				else
-					d = starpu_task_expected_length(task, archtype, nimpl);
+					d = starpu_task_worker_expected_length(task, workerid, component->tree->sched_ctx_id, nimpl);
 				if(isnan(d))
 				if(isnan(d))
 				{
 				{
 					*length = d;
 					*length = d;

+ 5 - 6
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -487,7 +487,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 			}
 			}
 
 
 			double exp_end;
 			double exp_end;
-			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
+			double local_length = starpu_task_worker_expected_length(task, worker, sched_ctx_id, nimpl);
 			double local_penalty = starpu_task_expected_data_transfer_time_for(task, worker);
 			double local_penalty = starpu_task_expected_data_transfer_time_for(task, worker);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
 
@@ -679,9 +679,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			}
 			}
 			else
 			else
 			{
 			{
-				local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_task_length[worker_ctx][nimpl] = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, nimpl);
 				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
 				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
-				local_energy[worker_ctx][nimpl] = starpu_task_expected_energy(task, perf_arch,nimpl);
+				local_energy[worker_ctx][nimpl] = starpu_task_worker_expected_energy(task, workerid, sched_ctx_id,nimpl);
 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
 				if (conversion_time > 0.0)
 				if (conversion_time > 0.0)
 					local_task_length[worker_ctx][nimpl] += conversion_time;
 					local_task_length[worker_ctx][nimpl] += conversion_time;
@@ -1100,10 +1100,9 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 {
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
-	/* Compute the expected penality */
-	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(perf_workerid, sched_ctx_id);
 
 
-	double predicted = starpu_task_expected_length(task, perf_arch,
+	/* Compute the expected penality */
+	double predicted = starpu_task_worker_expected_length(task, perf_workerid, STARPU_NMAX_SCHED_CTXS,
 						       starpu_task_get_implementation(task));
 						       starpu_task_get_implementation(task));
 
 
 	double predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);
 	double predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);

+ 11 - 1
tools/starpu_replay.c

@@ -1085,13 +1085,23 @@ int main(int argc, char **argv)
 		}
 		}
 		else if (TEST("Sizes"))
 		else if (TEST("Sizes"))
 		{
 		{
+			*ln = 0;
 			char *  buffer = s + 7;
 			char *  buffer = s + 7;
 			const char * delim = " ";
 			const char * delim = " ";
-			char * token = strtok(buffer, delim);
+			unsigned nb_parameters_line = count_number_tokens(buffer, delim); 
 			unsigned k = 0;
 			unsigned k = 0;
 
 
+			if(nb_parameters == 0)
+			{
+				nb_parameters = nb_parameters_line; 
+				arrays_managing(set_alloc_mode(nb_parameters));
+			}
+			else
+				STARPU_ASSERT(nb_parameters == nb_parameters_line);
+
 			_STARPU_MALLOC(sizes_set, nb_parameters * sizeof(size_t));
 			_STARPU_MALLOC(sizes_set, nb_parameters * sizeof(size_t));
 
 
+			char * token = strtok(buffer, delim);
 			while (token != NULL && k < nb_parameters)
 			while (token != NULL && k < nb_parameters)
 			{
 			{
 				sizes_set[k] = strtol(token, NULL, 10);
 				sizes_set[k] = strtol(token, NULL, 10);