Browse Source

branch starpu-on-mpi-new

Corentin Salingue 8 years ago
parent
commit
64bc8c56a9
77 changed files with 3693 additions and 832 deletions
  1. 1 0
      Makefile.am
  2. 410 370
      configure.ac
  3. 6 2
      examples/Makefile.am
  4. 1 0
      examples/binary/binary.c
  5. 6 2
      examples/stencil/Makefile.am
  6. 1 1
      examples/stencil/implicit-stencil-blocks.c
  7. 3 3
      examples/stencil/implicit-stencil-kernels.c
  8. 1 1
      examples/stencil/implicit-stencil-tasks.c
  9. 8 8
      examples/stencil/implicit-stencil.c
  10. 1 1
      examples/stencil/implicit-stencil.h
  11. 3 3
      examples/stencil/stencil-kernels.c
  12. 2 2
      examples/stencil/stencil-tasks.c
  13. 8 8
      examples/stencil/stencil.c
  14. 1 1
      examples/stencil/stencil.h
  15. 3 1
      include/schedulers/starpu_heteroprio.h
  16. 7 1
      include/starpu.h
  17. 1 0
      include/starpu_config.h.in
  18. 2 1
      include/starpu_data.h
  19. 9 1
      include/starpu_data_interfaces.h
  20. 40 0
      include/starpu_mpi_ms.h
  21. 5 1
      include/starpu_task.h
  22. 3 0
      include/starpu_worker.h
  23. 16 0
      src/Makefile.am
  24. 27 14
      src/common/fxt.h
  25. 1 1
      src/core/dependencies/data_arbiter_concurrency.c
  26. 1 1
      src/core/dependencies/data_concurrency.c
  27. 5 0
      src/core/perfmodel/perfmodel.c
  28. 244 49
      src/core/perfmodel/perfmodel_bus.c
  29. 13 4
      src/core/perfmodel/perfmodel_history.c
  30. 15 1
      src/core/task.c
  31. 5 0
      src/core/task.h
  32. 274 36
      src/core/topology.c
  33. 155 9
      src/core/workers.c
  34. 29 1
      src/core/workers.h
  35. 11 8
      src/datawizard/coherency.c
  36. 1 0
      src/datawizard/coherency.h
  37. 124 0
      src/datawizard/copy_driver.c
  38. 30 3
      src/datawizard/copy_driver.h
  39. 10 2
      src/datawizard/data_request.c
  40. 20 3
      src/datawizard/datawizard.c
  41. 3 2
      src/datawizard/datawizard.h
  42. 11 0
      src/datawizard/malloc.c
  43. 1 1
      src/datawizard/memalloc.c
  44. 5 0
      src/datawizard/memory_nodes.c
  45. 8 0
      src/datawizard/memory_nodes.h
  46. 1 1
      src/datawizard/write_back.c
  47. 52 12
      src/debug/traces/starpu_fxt.c
  48. 1 3
      src/drivers/cpu/driver_cpu.c
  49. 2 4
      src/drivers/cuda/driver_cuda.c
  50. 5 0
      src/drivers/driver_common/driver_common.c
  51. 5 5
      src/drivers/mic/driver_mic_common.c
  52. 5 5
      src/drivers/mic/driver_mic_common.h
  53. 3 3
      src/drivers/mic/driver_mic_source.c
  54. 82 11
      src/drivers/mp_common/mp_common.c
  55. 88 53
      src/drivers/mp_common/mp_common.h
  56. 178 18
      src/drivers/mp_common/sink_common.c
  57. 532 130
      src/drivers/mp_common/source_common.c
  58. 22 10
      src/drivers/mp_common/source_common.h
  59. 558 0
      src/drivers/mpi/driver_mpi_common.c
  60. 59 0
      src/drivers/mpi/driver_mpi_common.h
  61. 81 0
      src/drivers/mpi/driver_mpi_sink.c
  62. 33 0
      src/drivers/mpi/driver_mpi_sink.h
  63. 343 0
      src/drivers/mpi/driver_mpi_source.c
  64. 52 0
      src/drivers/mpi/driver_mpi_source.h
  65. 2 4
      src/drivers/opencl/driver_opencl.c
  66. 6 2
      src/drivers/scc/driver_scc_common.c
  67. 3 3
      src/drivers/scc/driver_scc_common.h
  68. 7 1
      src/drivers/scc/driver_scc_sink.c
  69. 3 3
      src/drivers/scc/driver_scc_sink.h
  70. 4 4
      src/drivers/scc/driver_scc_source.c
  71. 1 0
      src/starpu_parameters.h
  72. 4 0
      src/top/starpu_top.c
  73. 7 3
      tests/Makefile.am
  74. 2 1
      tests/datawizard/copy.c
  75. 1 0
      tests/datawizard/manual_reduction.c
  76. 19 14
      tests/errorcheck/starpu_init_noworker.c
  77. 1 0
      tests/perfmodels/valid_model.c

+ 1 - 0
Makefile.am

@@ -82,6 +82,7 @@ versinclude_HEADERS = 				\
 	include/starpu_openmp.h			\
 	include/starpu_sink.h			\
 	include/starpu_mic.h			\
+	include/starpu_mpi_ms.h			\
 	include/starpu_scc.h			\
 	include/starpu_expert.h			\
 	include/starpu_profiling.h		\

+ 410 - 370
configure.ac

@@ -78,6 +78,8 @@ AC_PROG_SED
 AC_PROG_LN_S
 AC_PROG_F77
 AC_PROG_FC
+AC_PROG_GREP
+AC_PROG_EGREP
 AC_CHECK_PROGS(PROG_STAT,gstat stat)
 AC_CHECK_PROGS(PROG_DATE,gdate date)
 AC_OPENMP
@@ -94,6 +96,306 @@ if test x$enable_perf_debug = xyes; then
     enable_shared=no
 fi
 
+###############################################################################
+#                                                                             #
+#                                 Drivers                                     #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(opencl-simulator, [AS_HELP_STRING([--enable-opencl-simulator],
+				[Enable the use of an OpenCL simulator])],
+				enable_opencl_simulator=$enableval, enable_opencl_simulator=no)
+if test x$enable_opencl_simulator = xyes; then
+	enable_simgrid=yes
+	AC_DEFINE(STARPU_OPENCL_SIMULATOR, [1], [Define this to enable using an OpenCL simulator])
+fi
+
+AC_ARG_WITH(simgrid-dir,
+	[AS_HELP_STRING([--with-simgrid-dir=<path>],
+	[specify SimGrid installation directory])],
+	[
+		simgrid_dir="$withval"
+		# in case this was not explicit yet
+		enable_simgrid=yes
+	], simgrid_dir=no)
+
+AC_ARG_WITH(simgrid-include-dir,
+	[AS_HELP_STRING([--with-simgrid-include-dir=<path>],
+	[specify where SimGrid headers are installed])],
+	[
+		simgrid_include_dir="$withval"
+		# in case this was not explicit yet
+		enable_simgrid=yes
+	], [simgrid_include_dir=no])
+
+AC_ARG_WITH(simgrid-lib-dir,
+	[AS_HELP_STRING([--with-simgrid-lib-dir=<path>],
+	[specify where SimGrid libraries are installed])],
+	[
+		simgrid_lib_dir="$withval"
+		# in case this was not explicit yet
+		enable_simgrid=yes
+	], [simgrid_lib_dir=no])
+
+AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
+			[Enable simulating execution in simgrid])],
+			enable_simgrid=$enableval, enable_simgrid=no)
+if test x$enable_simgrid = xyes ; then
+   	if test -n "$SIMGRID_CFLAGS" ; then
+	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
+	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
+	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
+	fi
+	if test -n "$SIMGRID_LIBS" ; then
+		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
+	fi
+	if test "$simgrid_dir" != "no" ; then
+	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
+	   	CXXFLAGS="-I$simgrid_dir/include $CXXFLAGS"
+	   	NVCCFLAGS="-I$simgrid_dir/include $NVCCFLAGS"
+	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
+	fi
+	if test "$simgrid_include_dir" != "no" ; then
+	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
+	   	CXXFLAGS="-I$simgrid_include_dir $CXXFLAGS"
+	   	NVCCFLAGS="-I$simgrid_include_dir $NVCCFLAGS"
+	fi
+	if test "$simgrid_lib_dir" != "no" ; then
+	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
+	fi
+	AC_HAVE_LIBRARY([simgrid], [],
+		[
+			AC_MSG_ERROR(Simgrid support needs simgrid installed)
+		]
+	)
+	AC_CHECK_HEADERS([simgrid/msg.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MSG_H], [1], [Define to 1 if you have msg.h in simgrid/.])])
+	AC_CHECK_HEADERS([xbt/synchro.h], [AC_DEFINE([STARPU_HAVE_XBT_SYNCHRO_H], [1], [Define to 1 if you have synchro.h in xbt/.])])
+	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
+   	AC_CHECK_FUNCS([MSG_process_join MSG_process_attach MSG_get_as_by_name MSG_environment_get_routing_root MSG_host_get_speed xbt_mutex_try_acquire smpi_process_set_user_data sg_link_name])
+	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
+	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
+	AC_CHECK_FUNCS([SIMIX_process_get_code], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE], [1], [Define to 1 if you have the `SIMIX_process_get_code' function.])])
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+		    		[[
+#ifdef STARPU_HAVE_SIMGRID_MSG_H
+#include <simgrid/msg.h>
+#else
+#include <msg/msg.h>
+#endif
+				 ]],
+				[[msg_host_t foo; ]]
+			    )],
+	                 [],
+	                 [
+			   AC_MSG_ERROR(StarPU needs a version of Simgrid which defines the type msg_host_t (should be any version >= 3.8.1))
+		         ])
+	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
+	# We won't bind or detect anything
+	with_hwloc=no
+
+	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
+	AC_LANG_PUSH([C++])
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+			  #ifdef HAVE_SIMGRID_MSG_H
+			  #include <simgrid/msg.h>
+			  #include <simgrid/host.h>
+			  #else
+			  #include <msg/msg.h>
+			  #endif
+			  ]])],,
+			  CXXFLAGS="-std=c++11 $CXXFLAGS"
+			  NVCCFLAGS="-std=c++11 $NVCCFLAGS")
+	AC_LANG_POP([C++])
+fi
+AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
+AC_SUBST(SIMGRID_CFLAGS)
+AC_SUBST(SIMGRID_LIBS)
+AC_MSG_CHECKING(whether SimGrid is enabled)
+AC_MSG_RESULT($enable_simgrid)
+
+AC_MSG_CHECKING(whether blocking drivers should be enabled)
+AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--enable-blocking-drivers], [enable blocking drivers])],
+				enable_blocking=$enableval, enable_blocking=$enable_simgrid)
+AC_MSG_RESULT($enable_blocking)
+
+if test x$enable_blocking = xno ; then
+	AC_DEFINE(STARPU_NON_BLOCKING_DRIVERS, [1], [drivers must progress])
+fi
+
+###############################################################################
+#                                                                             #
+#                                    MPI                                      #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
+                              [Disable StarPU MPI library generation])],
+            [enable_mpi=$enableval],
+            [enable_mpi=yes])
+
+AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
+                              [Enable StarPU to run with the master-slave mode])],
+            use_mpi_master_slave=$enableval,
+            use_mpi_master_slave=no)
+
+#Check MPICC
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
+           [Path of the mpicc compiler])],
+   [
+       if test x$withval = xyes; then
+           AC_MSG_ERROR(--with-mpicc must be given a pathname)
+       else
+           mpicc_path=$withval
+       fi
+   ],
+   [
+       if test x$enable_simgrid = xyes ; then
+           DEFAULT_MPICC=smpicc
+       else
+           DEFAULT_MPICC=mpicc
+       fi
+       # nothing was specified: default value is used
+       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+   ])
+
+# Check that the MPICC compiler exists and is executable
+if test ! -x "$mpicc_path"; then
+    # MPICC does not exist or is not executable: MPI cannot be used
+    AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
+    use_mpi=no
+else
+    use_mpi=yes
+    if test x$enable_simgrid = xyes ; then
+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
+                    [Path of the smpirun helper])],
+            [
+                if test x$withval = xyes; then
+                    AC_MSG_ERROR(--with-smpirun must be given a pathname)
+                else
+                    smpirun_path=$withval
+                fi
+            ],
+            [
+                # nothing was specified: default value is used
+                AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
+            ])
+
+    fi
+fi
+
+AC_MSG_CHECKING(mpicc path)
+AC_MSG_RESULT($mpicc_path)
+AC_SUBST(MPICC, $mpicc_path)
+
+
+#Check MPICXX/MPIC++
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
+           [Path of the mpicxx/mpic++ compiler])],
+   [
+       if test x$withval = xyes; then
+           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
+       else
+           mpicxx_path=$withval
+       fi
+   ],
+   [
+       if test x$enable_simgrid = xyes ; then
+           DEFAULT_MPICXX=smpicxx
+       else
+           DEFAULT_MPICXX=mpicxx
+       fi
+       # nothing was specified: default value is used
+       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+       
+       # try with mpic++ if mpicxx was not found
+       if test x$mpicxx_path = xno ; then
+            DEFAULT_MPICXX=mpic++
+            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+       fi
+   ])
+
+# Check that the MPICXX/MPIC++ compiler exists and is executable
+if test ! -x "$mpicxx_path"; then
+    # MPICXX/MPIC++ does not exist or is not executable
+    AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
+    use_mpicxx=no
+else
+    use_mpicxx=yes
+fi
+
+AC_MSG_CHECKING(mpicxx/mpic++ path)
+AC_MSG_RESULT($mpicxx_path)
+AC_SUBST(MPICXX, $mpicxx_path)
+
+
+if test x$use_mpi = xyes -a \( x$enable_mpi = xyes -o x$use_mpi_master_slave = xyes \) ; then
+    cc_or_mpicc=$mpicc_path
+        # For some reason, libtool uses gcc instead of mpicc when linking
+        # libstarpumpi.
+        # On Darwin (and maybe other systems ?) the linker will fail (undefined
+        # references to MPI_*). We manually add the required flags to fix this
+        # issue.
+        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
+else
+    cc_or_mpicc=$CC
+fi
+
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+
+AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
+				   [Enable StarPU MPI activity polling method])],
+				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
+if  test x$enable_mpi_progression_hook = xyes; then
+	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
+fi
+
+#We can only build MPI Master Slave if User wants it and MPI is available
+if test x$use_mpi_master_slave = xyes -a x$use_mpi = xyes -a x$use_mpicxx = xyes; then
+    build_mpi_master_slave=yes
+else
+    build_mpi_master_slave=no
+fi
+
+#Warn users that they cannot use both at the same time
+if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
+    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time !)
+fi
+
+if test x$build_mpi_master_slave = xyes; then # compile and link with the MPI compiler wrappers
+    AC_DEFINE(STARPU_USE_MPI_MASTER_SLAVE, [1], [MPI Master Slave support is enabled])
+    CC=$mpicc_path
+    CCLD=$mpicc_path
+    CXX=$mpicxx_path
+    CXXLD=$mpicxx_path
+fi
+
+AC_ARG_WITH(mpi-master-slave-multiple-thread, [AS_HELP_STRING([--with-mpi-master-slave-multiple-thread])],
+	[AC_DEFINE([STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD], [1], [Use multiple threads to communicate with slaves])])
+
+AC_MSG_CHECKING(whether the master-slave mode should be enabled)
+AC_MSG_RESULT($build_mpi_master_slave)
+AM_CONDITIONAL([STARPU_USE_MPI_MASTER_SLAVE], [test x$build_mpi_master_slave = xyes])
+
+AC_MSG_CHECKING(maximum number of MPI master-slave devices)
+AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
+			[maximum number of MPI master-slave devices])],
+			nmaxmpidev=$enableval,
+            [
+             if test x$build_mpi_master_slave = xyes; then
+                 nmaxmpidev=4
+             else
+                 nmaxmpidev=0
+             fi
+            ])
+AC_MSG_RESULT($nmaxmpidev)
+AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
+
+###############################################################################
+#                                                                             #
+#                                LIBTOOLS                                     #
+#                                                                             #
+###############################################################################
+
 LT_PREREQ([2.2])
 LT_INIT([win32-dll])
 
@@ -139,6 +441,85 @@ AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
 
 ###############################################################################
 #                                                                             #
+#                       Miscellaneous things for MPI                          #
+#                                                                             #
+###############################################################################
+
+# If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
+AC_ARG_ENABLE(mpi-check, [AS_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases])])
+running_mpi_check=no
+if test "x$svndir" = x1 -o -d "$srcdir/.git" ; then
+    running_mpi_check=yes
+fi
+if test x$enable_mpi_check = xyes ; then
+    running_mpi_check=yes
+fi
+if test x$enable_mpi_check = xno ; then
+    running_mpi_check=no
+fi
+
+
+# Check if mpiexec is available
+AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
+            [Path of mpiexec])],
+    [
+        if test x$withval = xyes; then
+            AC_MSG_ERROR(--with-mpiexec must be given a pathname)
+        else
+            mpiexec_path=$withval
+        fi
+    ],
+    [
+        # nothing was specified: look in the path
+        AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
+    ])
+
+AC_MSG_CHECKING(whether mpiexec is available)
+AC_MSG_RESULT($mpiexec_path)
+
+# Check that MPIEXEC exists and is executable
+if test ! -x "$mpiexec_path"; then
+    # MPIEXEC does not exist or is not executable: MPI tests cannot be run
+    AC_MSG_RESULT(The mpiexec script is not valid)
+    running_mpi_check=no
+    mpiexec_path=""
+fi
+
+AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
+if test x$use_mpi = xyes ; then
+    AC_MSG_CHECKING(whether MPI tests should be run)
+    AC_MSG_RESULT($running_mpi_check)
+    AC_SUBST(MPIEXEC,$mpiexec_path)
+fi
+
+#We can only build StarPU MPI Library if User wants it and MPI is available
+if test x$use_mpi = xyes -a x$enable_mpi = xyes ; then
+    build_mpi_lib=yes
+else
+    build_mpi_lib=no
+fi
+
+AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
+AC_MSG_RESULT($build_mpi_lib)
+
+AC_SUBST(USE_MPI, $build_mpi_lib)
+AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes)
+if test x$build_mpi_lib = xyes; then
+	AC_DEFINE(STARPU_USE_MPI,[1],[whether the StarPU MPI library is available])
+else
+	running_mpi_check=no
+fi
+
+AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
+			[Arguments for mpiexec])],
+	[
+		mpiexec_args=$withval
+	])
+AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
+
+
+###############################################################################
+#                                                                             #
 #                           MIC device compilation                            #
 #   (Must be done in beginning to change prefix in the whole configuration)   #
 #                                                                             #
@@ -1021,143 +1402,30 @@ if test x$disable_asynchronous_opencl_copy = xyes ; then
    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
 fi
 
-AC_MSG_CHECKING(whether asynchronous MIC copy should be disabled)
-AC_ARG_ENABLE(asynchronous-mic-copy, [AS_HELP_STRING([--disable-asynchronous-mic-copy],
-			[disable asynchronous copy between CPU and MIC devices])],
-			enable_asynchronous_mic_copy=$enableval, enable_asynchronous_mic_copy=yes)
-disable_asynchronous_mic_copy=no
-if test x$enable_asynchronous_mic_copy = xno ; then
-   disable_asynchronous_mic_copy=yes
-fi
-AC_MSG_RESULT($disable_asynchronous_mic_copy)
-if test x$disable_asynchronous_mic_copy = xyes ; then
-   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and MIC devices])
-fi
-
-###############################################################################
-#                                                                             #
-#                                 Drivers                                     #
-#                                                                             #
-###############################################################################
-
-AC_ARG_ENABLE(opencl-simulator, [AS_HELP_STRING([--enable-opencl-simulator],
-				[Enable the use of an OpenCL simulator])],
-				enable_opencl_simulator=$enableval, enable_opencl_simulator=no)
-if test x$enable_opencl_simulator = xyes; then
-	enable_simgrid=yes
-	AC_DEFINE(STARPU_OPENCL_SIMULATOR, [1], [Define this to enable using an OpenCL simulator])
-fi
-
-AC_ARG_WITH(simgrid-dir,
-	[AS_HELP_STRING([--with-simgrid-dir=<path>],
-	[specify SimGrid installation directory])],
-	[
-		simgrid_dir="$withval"
-		# in case this was not explicit yet
-		enable_simgrid=yes
-	], simgrid_dir=no)
-
-AC_ARG_WITH(simgrid-include-dir,
-	[AS_HELP_STRING([--with-simgrid-include-dir=<path>],
-	[specify where SimGrid headers are installed])],
-	[
-		simgrid_include_dir="$withval"
-		# in case this was not explicit yet
-		enable_simgrid=yes
-	], [simgrid_include_dir=no])
-
-AC_ARG_WITH(simgrid-lib-dir,
-	[AS_HELP_STRING([--with-simgrid-lib-dir=<path>],
-	[specify where SimGrid libraries are installed])],
-	[
-		simgrid_lib_dir="$withval"
-		# in case this was not explicit yet
-		enable_simgrid=yes
-	], [simgrid_lib_dir=no])
-
-AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
-			[Enable simulating execution in simgrid])],
-			enable_simgrid=$enableval, enable_simgrid=no)
-if test x$enable_simgrid = xyes ; then
-   	if test -n "$SIMGRID_CFLAGS" ; then
-	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
-	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
-	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
-	fi
-	if test -n "$SIMGRID_LIBS" ; then
-		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
-	fi
-	if test "$simgrid_dir" != "no" ; then
-	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
-	   	CXXFLAGS="-I$simgrid_dir/include $CXXFLAGS"
-	   	NVCCFLAGS="-I$simgrid_dir/include $NVCCFLAGS"
-	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
-	fi
-	if test "$simgrid_include_dir" != "no" ; then
-	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
-	   	CXXFLAGS="-I$simgrid_include_dir $CXXFLAGS"
-	   	NVCCFLAGS="-I$simgrid_include_dir $NVCCFLAGS"
-	fi
-	if test "$simgrid_lib_dir" != "no" ; then
-	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
-	fi
-	AC_HAVE_LIBRARY([simgrid], [],
-		[
-			AC_MSG_ERROR(Simgrid support needs simgrid installed)
-		]
-	)
-	AC_CHECK_HEADERS([simgrid/msg.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MSG_H], [1], [Define to 1 if you have msg.h in simgrid/.])])
-	AC_CHECK_HEADERS([xbt/synchro.h], [AC_DEFINE([STARPU_HAVE_XBT_SYNCHRO_H], [1], [Define to 1 if you have synchro.h in xbt/.])])
-	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
-   	AC_CHECK_FUNCS([MSG_process_join MSG_process_attach MSG_get_as_by_name MSG_environment_get_routing_root MSG_host_get_speed xbt_mutex_try_acquire smpi_process_set_user_data sg_link_name])
-	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
-	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
-	AC_CHECK_FUNCS([SIMIX_process_get_code], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE], [1], [Define to 1 if you have the `SIMIX_process_get_code' function.])])
-	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
-		    		[[
-#ifdef STARPU_HAVE_SIMGRID_MSG_H
-#include <simgrid/msg.h>
-#else
-#include <msg/msg.h>
-#endif
-				 ]],
-				[[msg_host_t foo; ]]
-			    )],
-	                 [],
-	                 [
-			   AC_MSG_ERROR(StarPU needs a version of Simgrid which defines the type msg_host_t (should be any version >= 3.8.1))
-		         ])
-	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
-	# We won't bind or detect anything
-	with_hwloc=no
-
-	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
-	AC_LANG_PUSH([C++])
-	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
-			  #ifdef HAVE_SIMGRID_MSG_H
-			  #include <simgrid/msg.h>
-			  #include <simgrid/host.h>
-			  #else
-			  #include <msg/msg.h>
-			  #endif
-			  ]])],,
-			  CXXFLAGS="-std=c++11 $CXXFLAGS"
-			  NVCCFLAGS="-std=c++11 $NVCCFLAGS")
-	AC_LANG_POP([C++])
-fi
-AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
-AC_SUBST(SIMGRID_CFLAGS)
-AC_SUBST(SIMGRID_LIBS)
-AC_MSG_CHECKING(whether SimGrid is enabled)
-AC_MSG_RESULT($enable_simgrid)
-
-AC_MSG_CHECKING(whether blocking drivers should be enabled)
-AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--enable-blocking-drivers], [enable blocking drivers])],
-				enable_blocking=$enableval, enable_blocking=$enable_simgrid)
-AC_MSG_RESULT($enable_blocking)
+AC_MSG_CHECKING(whether asynchronous MIC copy should be disabled)
+AC_ARG_ENABLE(asynchronous-mic-copy, [AS_HELP_STRING([--disable-asynchronous-mic-copy],
+			[disable asynchronous copy between CPU and MIC devices])],
+			enable_asynchronous_mic_copy=$enableval, enable_asynchronous_mic_copy=yes)
+disable_asynchronous_mic_copy=no
+if test x$enable_asynchronous_mic_copy = xno ; then
+   disable_asynchronous_mic_copy=yes
+fi
+AC_MSG_RESULT($disable_asynchronous_mic_copy)
+if test x$disable_asynchronous_mic_copy = xyes ; then
+   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and MIC devices])
+fi
 
-if test x$enable_blocking = xno ; then
-	AC_DEFINE(STARPU_NON_BLOCKING_DRIVERS, [1], [drivers must progress])
+AC_MSG_CHECKING(whether asynchronous MPI Master Slave copy should be disabled)
+AC_ARG_ENABLE(asynchronous-mpi-master-slave-copy, [AS_HELP_STRING([--disable-asynchronous-mpi-master-slave-copy],
+			[disable asynchronous copy between MPI Master and MPI Slave devices])],
+			enable_asynchronous_mpi_master_slave_copy=$enableval, enable_asynchronous_mpi_master_slave_copy=yes)
+disable_asynchronous_mpi_master_slave_copy=no
+if test x$enable_asynchronous_mpi_master_slave_copy = xno ; then
+   disable_asynchronous_mpi_master_slave_copy=yes
+fi
+AC_MSG_RESULT($disable_asynchronous_mpi_master_slave_copy)
+if test x$disable_asynchronous_mpi_master_slave_copy = xyes ; then
+   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY], [1], [Define to 1 to disable asynchronous copy between MPI Master and MPI Slave devices])
 fi
 
 ###############################################################################
@@ -1733,238 +2001,6 @@ AM_CONDITIONAL([STARPU_USE_AYUDAME2], [test "x$enable_ayudame2" = "xyes"])
 
 ###############################################################################
 #                                                                             #
-#                                    MPI                                      #
-#                                                                             #
-###############################################################################
-
-AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
-                              [Disable StarPU MPI library generation])],
-            [enable_mpi=$enableval],
-            [enable_mpi=yes])
-
-AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
-                              [Enable StarPU to run with the master-slave mode])],
-            use_mpi_master_slave=$enableval,
-            use_mpi_master_slave=no)
-
-#Check MPICC
-AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
-           [Path of the mpicc compiler])],
-   [
-       if test x$withval = xyes; then
-           AC_MSG_ERROR(--with-mpicc must be given a pathname)
-       else
-           mpicc_path=$withval
-       fi
-   ],
-   [
-       if test x$enable_simgrid = xyes ; then
-           DEFAULT_MPICC=smpicc
-       else
-           DEFAULT_MPICC=mpicc
-       fi
-       # nothing was specified: default value is used
-       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
-   ])
-
-# We test if the MPICC compiler exists
-if test ! -x $mpicc_path; then
-    #MPICC does not exists or is not executable
-    AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
-    use_mpi=no
-else
-    use_mpi=yes
-    if test x$enable_simgrid = xyes ; then
-        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
-                    [Path of the smpirun helper])],
-            [
-                if test x$withval = xyes; then
-                    AC_MSG_ERROR(--with-smpirun must be given a pathname)
-                else
-                    smpirun_path=$withval
-                fi
-            ],
-            [
-                # nothing was specified: default value is used
-                AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
-            ])
-
-    fi
-fi
-
-AC_MSG_CHECKING(mpicc path)
-AC_MSG_RESULT($mpicc_path)
-AC_SUBST(MPICC, $mpicc_path)
-
-
-#Check MPICXX/MPIC++
-AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
-           [Path of the mpicxx/mpic++ compiler])],
-   [
-       if test x$withval = xyes; then
-           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
-       else
-           mpicxx_path=$withval
-       fi
-   ],
-   [
-       if test x$enable_simgrid = xyes ; then
-           DEFAULT_MPICXX=smpicxx
-       else
-           DEFAULT_MPICXX=mpicxx
-       fi
-       # nothing was specified: default value is used
-       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
-       
-       # try with mpic++ if mpicxx was not found
-       if test x$mpicxx_path = xno ; then
-            DEFAULT_MPICXX=mpic++
-            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
-       fi
-   ])
-
-# We test if the MPICXX/MPIC++ compiler exists
-if test ! -x $mpicxx_path; then
-    #MPICXX/MPIC++ does not exists or is not executable
-    AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
-    use_mpicxx=no
-else
-    use_mpicxx=yes
-fi
-
-AC_MSG_CHECKING(mpicxx/mpic++ path)
-AC_MSG_RESULT($mpicxx_path)
-AC_SUBST(MPICXX, $mpicxx_path)
-
-
-if test x$use_mpi = xyes -a x$enable_mpi = xyes; then
-    cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
-else
-    cc_or_mpicc=$CC
-fi
-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
-
-# If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
-AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
-running_mpi_check=no
-if test $svndir = 1 -o -d "$srcdir/.git" ; then
-    running_mpi_check=yes
-fi
-if test x$enable_mpi_check = xyes ; then
-    running_mpi_check=yes
-fi
-if test x$enable_mpi_check = xno ; then
-    running_mpi_check=no
-fi
-
-
-# Check if mpiexec is available
-AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
-            [Path of mpiexec])],
-    [
-        if test x$withval = xyes; then
-            AC_MSG_ERROR(--with-mpiexec must be given a pathname)
-        else
-            mpiexec_path=$withval
-        fi
-    ],
-    [
-        # nothing was specified: look in the path
-        AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
-    ])
-
-AC_MSG_CHECKING(whether mpiexec is available)
-AC_MSG_RESULT($mpiexec_path)
-
-# We test if MPIEXEC exists
-if test ! -x $mpiexec_path; then
-    #MPIEXEC does not exists or is not executable
-    AC_MSG_RESULT(The mpiexec script is not valid)
-        running_mpi_check=no
-        mpiexec_path=""
-fi
-
-AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
-if test x$use_mpi = xyes ; then
-    AC_MSG_CHECKING(whether MPI tests should be run)
-    AC_MSG_RESULT($running_mpi_check)
-    AC_SUBST(MPIEXEC,$mpiexec_path)
-fi
-
-#We can only build StarPU MPI Library if User wants it and MPI is available
-if test x$use_mpi = xyes -a x$enable_mpi = xyes ; then
-    build_mpi_lib=yes
-else
-    build_mpi_lib=no
-fi
-
-AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
-AC_MSG_RESULT($build_mpi_lib)
-
-AC_SUBST(USE_MPI, $build_mpi_lib)
-AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes)
-if test x$build_mpi_lib = xyes; then
-	AC_DEFINE(STARPU_USE_MPI,[1],[whether the StarPU MPI library is available])
-else
-	running_mpi_check=no
-fi
-
-AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
-			[Arguments for mpiexec])],
-	[
-		mpiexec_args=$withval
-	])
-AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
-
-AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
-				   [Enable StarPU MPI activity polling method])],
-				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
-if  test x$enable_mpi_progression_hook = xyes; then
-	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
-fi
-
-#We can only build MPI Master Slave if User wants it and MPI is available
-if test x$use_mpi_master_slave = xyes -a x$use_mpi = xyes -a x$use_mpicxx = xyes; then
-    build_mpi_master_slave=yes
-else
-    build_mpi_master_slave=no
-fi
-
-if test x$build_mpi_master_slave = xyes; then
-    AC_DEFINE(STARPU_USE_MPI_MASTER_SLAVE, [1], [MPI Master Slave support is enabled])
-    CC=$mpicc_path    
-    CCLD=$mpicc_path      
-    CXX=$mpicxx_path      
-    CXXLD=mpicxx_path    
-fi
-
-AC_MSG_CHECKING(whether the master-slave mode should be enabled)
-AC_MSG_RESULT($build_mpi_master_slave)
-AM_CONDITIONAL([STARPU_USE_MPI_MASTER_SLAVE], [test x$build_mpi_master_slave = xyes])
-
-AC_MSG_CHECKING(maximum number of MPI master-slave devices)
-AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
-			[maximum number of MPI master-slave devices])],
-			nmaxmpidev=$enableval,
-            [
-             if test x$build_mpi_master_slave = xyes; then
-                 nmaxmpidev=4
-             else
-                 nmaxmpidev=0
-             fi
-            ])
-AC_MSG_RESULT($nmaxmpidev)
-AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
-
-
-###############################################################################
-#                                                                             #
 #                  Miscellaneous options for StarPU                           #
 #                                                                             #
 ###############################################################################
@@ -2087,9 +2123,13 @@ if test x$enable_simgrid != xyes; then
 	if test x$enable_rcce != xyes; then
 		nmaxsccdev=0
 	fi
+    # When MPI master-slave cannot be built, nmaxmpidev defaults to zero.
+    # Since it is multiplied by maxcpus below, force it to one so the CPU count is kept.
+    if test x$build_mpi_master_slave != xyes; then
+        nmaxmpidev=1
+    fi
 fi
-#We suppose Master adds nmaxmpidev workers but slaves don't.
-nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmicthreads + $nmaxmpidev + $nmaxsccdev + 15 \) / 16 \) `
+nmaxworkers=`expr 16 \* \( \( \( $nmaxmpidev \* $maxcpus \) + $nmaxcudadev + $nmaxopencldev + $nmaxmicthreads + $nmaxsccdev + 15 \) / 16 \) `
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])

+ 6 - 2
examples/Makefile.am

@@ -178,11 +178,15 @@ LOADER			=
 LOADER_BIN		=	$(top_builddir)/examples/loader-cross.sh
 endif
 
+if STARPU_USE_MPI_MASTER_SLAVE
+MPI = $(MPIEXEC) $(MPIEXEC_ARGS) -np 4
+endif
+
 if STARPU_HAVE_AM111
 TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
-LOG_COMPILER		=	$(LOADER_BIN)
+LOG_COMPILER		=	$(MPI) $(LOADER_BIN)
 else
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 
 endif

+ 1 - 0
examples/binary/binary.c

@@ -127,6 +127,7 @@ int main(int argc, char **argv)
 	conf.ncuda = 0;
 	conf.nmic = 0;
 	conf.nscc = 0;
+	conf.nmpi_ms = 0;
 
         ret = starpu_init(&conf);
 	if (STARPU_UNLIKELY(ret == -ENODEV))

+ 6 - 2
examples/stencil/Makefile.am

@@ -79,11 +79,15 @@ LOADER			=
 LOADER_BIN		=	$(top_builddir)/examples/stencil/loader-cross.sh
 endif
 
+if STARPU_USE_MPI_MASTER_SLAVE
+MPI = $(MPIEXEC) $(MPIEXEC_ARGS) -np 4
+endif
+
 if STARPU_HAVE_AM111
 TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
-LOG_COMPILER		=	$(LOADER_BIN)
+LOG_COMPILER		=	$(MPI) $(LOADER_BIN)
 else
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 
 endif

+ 1 - 1
examples/stencil/implicit-stencil-blocks.c

@@ -333,7 +333,7 @@ void allocate_memory_on_node(int rank)
 			starpu_block_data_register(&block->boundaries_handle[B][1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
 		}
 
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI)  && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 		/* Register all data to StarPU-MPI, even the ones that are not
 		 * allocated on the local node. */
 

+ 3 - 3
examples/stencil/implicit-stencil-kernels.c

@@ -192,7 +192,7 @@ static void update_func_cuda(void *descr[], void *arg)
 		FPRINTF(stderr,"!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
 	else
 		DEBUG( "!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	DEBUG( "!!!           RANK %d              !!!\n", rank);
@@ -282,7 +282,7 @@ static void update_func_opencl(void *descr[], void *arg)
 		FPRINTF(stderr,"!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
 	else
 		DEBUG( "!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	DEBUG( "!!!           RANK %d              !!!\n", rank);
@@ -355,7 +355,7 @@ void update_func_cpu(void *descr[], void *arg)
 		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
 	else
 		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	DEBUG( "!!!           RANK %d            !!!\n", rank);

+ 1 - 1
examples/stencil/implicit-stencil-tasks.c

@@ -35,7 +35,7 @@
 # define DEBUG(fmt, ...)
 #endif
 
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 #include <starpu_mpi.h>
 #define starpu_insert_task(...) starpu_mpi_insert_task(MPI_COMM_WORLD, __VA_ARGS__)
 #endif

+ 8 - 8
examples/stencil/implicit-stencil.c

@@ -193,7 +193,7 @@ void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
 
 unsigned global_workerid(unsigned local_workerid)
 {
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int rank;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	unsigned workers_per_node = starpu_worker_get_count();
@@ -210,7 +210,7 @@ int main(int argc, char **argv)
 	int world_size;
 	int ret;
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int thread_support;
 	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support))
 	{
@@ -237,7 +237,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 #endif
@@ -249,7 +249,7 @@ int main(int argc, char **argv)
 
 	init_problem(argc, argv, rank, world_size);
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 #endif
@@ -270,7 +270,7 @@ int main(int argc, char **argv)
 
 	end = starpu_timing_now();
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 #endif
@@ -288,7 +288,7 @@ int main(int argc, char **argv)
 	double max_timing = timing;
 	double sum_timing = timing;
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int reduce_ret;
 
 	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
@@ -368,13 +368,13 @@ int main(int argc, char **argv)
 
 	free_problem(rank);
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	starpu_mpi_shutdown();
 #endif
 
 	starpu_shutdown();
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	MPI_Finalize();
 #endif
 

+ 1 - 1
examples/stencil/implicit-stencil.h

@@ -23,7 +23,7 @@
 #include <starpu.h>
 
 #ifndef __CUDACC__
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 #include <mpi.h>
 #include <starpu_mpi.h>
 #endif

+ 3 - 3
examples/stencil/stencil-kernels.c

@@ -189,7 +189,7 @@ static void update_func_cuda(void *descr[], void *arg)
 		FPRINTF(stderr,"!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
 	else
 		DEBUG( "!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	DEBUG( "!!!           RANK %d              !!!\n", rank);
@@ -276,7 +276,7 @@ static void update_func_opencl(void *descr[], void *arg)
 		FPRINTF(stderr,"!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
 	else
 		DEBUG( "!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	DEBUG( "!!!           RANK %d              !!!\n", rank);
@@ -346,7 +346,7 @@ void update_func_cpu(void *descr[], void *arg)
 		FPRINTF(stderr,"!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
 	else
 		DEBUG( "!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	DEBUG( "!!!           RANK %d            !!!\n", rank);

+ 2 - 2
examples/stencil/stencil-tasks.c

@@ -82,7 +82,7 @@ static void send_done(void *arg)
 	DEBUG("DO SEND %d\n", (int)z);
 }
 
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 /* Post MPI send */
 static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, int local_rank)
 {
@@ -138,7 +138,7 @@ void create_task_save(unsigned iter, unsigned z, int dir, int local_rank)
 	int node_z = get_block_mpi_node(z);
 	int node_z_and_d = get_block_mpi_node(z+dir);
 
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	if (node_z == local_rank)
 	{
 		/* Save data from update */

+ 8 - 8
examples/stencil/stencil.c

@@ -188,7 +188,7 @@ void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
 
 unsigned global_workerid(unsigned local_workerid)
 {
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int rank;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	unsigned workers_per_node = starpu_worker_get_count();
@@ -205,7 +205,7 @@ int main(int argc, char **argv)
 	int world_size;
 	int ret;
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int thread_support;
 	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support))
 	{
@@ -232,7 +232,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 #endif
@@ -246,7 +246,7 @@ int main(int argc, char **argv)
 
 	create_tasks(rank);
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 #endif
@@ -263,7 +263,7 @@ int main(int argc, char **argv)
 
 	end = starpu_timing_now();
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 #endif
@@ -274,7 +274,7 @@ int main(int argc, char **argv)
 
 	/*display_debug(nbz, niter, rank);*/
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	starpu_mpi_shutdown();
 #endif
 
@@ -285,7 +285,7 @@ int main(int argc, char **argv)
 	double max_timing = timing;
 	double sum_timing = timing;
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	int reduce_ret;
 
 	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
@@ -366,7 +366,7 @@ int main(int argc, char **argv)
 	free_problem(rank);
 	starpu_shutdown();
 
-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 	MPI_Finalize();
 #endif
 

+ 1 - 1
examples/stencil/stencil.h

@@ -23,7 +23,7 @@
 #include <starpu.h>
 
 #ifndef __CUDACC__
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 #include <mpi.h>
 #include <starpu_mpi.h>
 #endif

+ 3 - 1
include/schedulers/starpu_heteroprio.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015  INRIA
+ * Copyright (C) 2015, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,6 +42,7 @@ enum starpu_heteroprio_types
 	STARPU_CUDA_IDX,
 	STARPU_OPENCL_IDX,
 	STARPU_MIC_IDX,
 	STARPU_SCC_IDX,
+	STARPU_MPI_MS_IDX, // must sit at the same position as STARPU_MPI_MS in starpu_heteroprio_types_to_arch
 // This will be the number of archs
 	STARPU_NB_TYPES
@@ -54,6 +55,7 @@ static const unsigned starpu_heteroprio_types_to_arch[STARPU_NB_TYPES+1] =
 	STARPU_OPENCL,
 	STARPU_MIC,
 	STARPU_SCC,
+        STARPU_MPI_MS,
 	0
 };
 

+ 7 - 1
include/starpu.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2014, 2016  Université de Bordeaux
  * Copyright (C) 2010-2015  CNRS
- * Copyright (C) 2014  INRIA
+ * Copyright (C) 2014, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -90,6 +90,7 @@ struct starpu_conf
 	int nopencl;
 	int nmic;
 	int nscc;
+        int nmpi_ms;
 
 	unsigned use_explicit_workers_bindid;
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
@@ -106,6 +107,9 @@ struct starpu_conf
 	unsigned use_explicit_workers_scc_deviceid;
 	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
 
+	unsigned use_explicit_workers_mpi_deviceid;
+	unsigned workers_mpi_deviceid[STARPU_NMAXWORKERS];
+
 	int bus_calibrate;
 	int calibrate;
 
@@ -117,6 +121,7 @@ struct starpu_conf
 	int disable_asynchronous_cuda_copy;
 	int disable_asynchronous_opencl_copy;
 	int disable_asynchronous_mic_copy;
+	int disable_asynchronous_mpi_ms_copy;
 
 	unsigned *cuda_opengl_interoperability;
 	unsigned n_cuda_opengl_interoperability;
@@ -146,6 +151,7 @@ int starpu_asynchronous_copy_disabled(void);
 int starpu_asynchronous_cuda_copy_disabled(void);
 int starpu_asynchronous_opencl_copy_disabled(void);
 int starpu_asynchronous_mic_copy_disabled(void);
+int starpu_asynchronous_mpi_ms_copy_disabled(void);
 
 void starpu_display_stats();
 

+ 1 - 0
include/starpu_config.h.in

@@ -35,6 +35,7 @@
 #undef STARPU_USE_OPENCL
 #undef STARPU_USE_MIC
 #undef STARPU_USE_SCC
+#undef STARPU_USE_MPI_MASTER_SLAVE
 
 #undef STARPU_OPENMP
 

+ 2 - 1
include/starpu_data.h

@@ -117,7 +117,8 @@ enum starpu_node_kind
 	STARPU_DISK_RAM   = 0x04,
 	STARPU_MIC_RAM    = 0x05,
 	STARPU_SCC_RAM    = 0x06,
-	STARPU_SCC_SHM    = 0x07
+	STARPU_SCC_SHM    = 0x07,
+	STARPU_MPI_MS_RAM = 0x08
 
 };
 

+ 9 - 1
include/starpu_data_interfaces.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010-2014  CNRS
- * Copyright (C) 2011-2012  INRIA
+ * Copyright (C) 2011-2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -59,6 +59,10 @@ struct starpu_data_copy_methods
 	int (*scc_sink_to_src)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*scc_sink_to_sink)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 
+	int (*ram_to_mpi_ms)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*mpi_ms_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*mpi_ms_to_mpi_ms)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+
 #ifdef STARPU_USE_CUDA
 	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, starpu_cudaStream_t stream);
 	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, starpu_cudaStream_t stream);
@@ -79,6 +83,10 @@ struct starpu_data_copy_methods
 	int (*opencl_to_opencl_async)();
 #endif
 
+	int (*ram_to_mpi_ms_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event);
+	int (*mpi_ms_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event);
+	int (*mpi_ms_to_mpi_ms_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event);
+
 	int (*ram_to_mic_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*mic_to_ram_async)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
 

+ 40 - 0
include/starpu_mpi_ms.h

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_MS_H__
+#define __STARPU_MPI_MS_H__
+
+#include <starpu_config.h>
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE /* declarations below exist only when MPI master-slave support is configured */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+typedef void *starpu_mpi_ms_func_symbol_t;
+
+int starpu_mpi_ms_register_kernel(starpu_mpi_ms_func_symbol_t *symbol, const char *func_name);
+
+starpu_mpi_ms_kernel_t starpu_mpi_ms_get_kernel(starpu_mpi_ms_func_symbol_t symbol); /* NOTE(review): starpu_mpi_ms_kernel_t is declared in starpu_task.h, which this header does not include itself */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
+#endif /* __STARPU_MPI_MS_H__ */

+ 5 - 1
include/starpu_task.h

@@ -3,7 +3,7 @@
  * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
  * Copyright (C) 2011  Télécom-SudParis
- * Copyright (C) 2011, 2014  INRIA
+ * Copyright (C) 2011, 2014, 2016  INRIA
  * Copyright (C) 2016  Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -43,6 +43,7 @@ extern "C"
 #define STARPU_OPENCL	((1ULL)<<6)
 #define STARPU_MIC	((1ULL)<<7)
 #define STARPU_SCC	((1ULL)<<8)
+#define STARPU_MPI_MS	((1ULL)<<9)
 
 #define STARPU_CODELET_SIMGRID_EXECUTE	(1<<0)
 #define STARPU_CUDA_ASYNC	(1<<0)
@@ -75,9 +76,11 @@ typedef void (*starpu_cpu_func_t)(void **, void*);
 typedef void (*starpu_cuda_func_t)(void **, void*);
 typedef void (*starpu_opencl_func_t)(void **, void*);
 typedef void (*starpu_mic_kernel_t)(void **, void*);
+typedef void (*starpu_mpi_ms_kernel_t)(void **, void*);
 typedef void (*starpu_scc_kernel_t)(void **, void*);
 
 typedef starpu_mic_kernel_t (*starpu_mic_func_t)(void);
+typedef starpu_mpi_ms_kernel_t (*starpu_mpi_ms_func_t)(void);
 typedef starpu_scc_kernel_t (*starpu_scc_func_t)(void);
 
 #define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    ((starpu_cpu_func_t) -1)
@@ -104,6 +107,7 @@ struct starpu_codelet
 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
 	char opencl_flags[STARPU_MAXIMPLEMENTATIONS];
 	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_mpi_ms_func_t mpi_ms_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS];
 
 	const char *cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS];

+ 3 - 0
include/starpu_worker.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2013, 2016  Université de Bordeaux
  * Copyright (C) 2010-2014  CNRS
+ * Copyright (C) 2016  INRIA
  * Copyright (C) 2016  Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -35,6 +36,7 @@ enum starpu_worker_archtype
 	STARPU_OPENCL_WORKER,
 	STARPU_MIC_WORKER,
 	STARPU_SCC_WORKER,
+	STARPU_MPI_WORKER,
 	STARPU_ANY_WORKER
 };
 
@@ -89,6 +91,7 @@ unsigned starpu_cuda_worker_get_count(void);
 unsigned starpu_opencl_worker_get_count(void);
 unsigned starpu_mic_worker_get_count(void);
 unsigned starpu_scc_worker_get_count(void);
+unsigned starpu_mpi_ms_worker_get_count(void);
 
 unsigned starpu_mic_device_get_count(void);
 

+ 16 - 0
src/Makefile.am

@@ -136,6 +136,9 @@ noinst_HEADERS = 						\
 	drivers/scc/driver_scc_common.h				\
 	drivers/scc/driver_scc_source.h				\
 	drivers/scc/driver_scc_sink.h				\
+	drivers/mpi/driver_mpi_common.h				\
+	drivers/mpi/driver_mpi_source.h				\
+	drivers/mpi/driver_mpi_sink.h				\
 	drivers/disk/driver_disk.h				\
 	debug/traces/starpu_fxt.h				\
 	profiling/bound.h					\
@@ -368,6 +371,19 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_utils.
 endif
 
 #########################################
+#                                       # 	 
+#     MPI Master/Slave compilation      # 	 
+#                                       # 	 
+######################################### 	 
+
+if STARPU_USE_MPI_MASTER_SLAVE 	 
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_common.c 	 
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_source.c 	 
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_sink.c 	 
+endif 	 
+
+
+#########################################
 
 showcheck:
 	-cat /dev/null

+ 27 - 14
src/common/fxt.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2016  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -40,6 +41,7 @@
 #define _STARPU_FUT_OPENCL_KEY	0x103
 #define _STARPU_FUT_MIC_KEY	0x104
 #define _STARPU_FUT_SCC_KEY	0x105
+#define _STARPU_FUT_MPI_KEY	0x106
 
 #define _STARPU_FUT_WORKER_INIT_START	0x5100
 #define _STARPU_FUT_WORKER_INIT_END	0x5101
@@ -52,10 +54,10 @@
 
 #define _STARPU_FUT_UPDATE_TASK_CNT	0x5106
 
-#define _STARPU_FUT_START_FETCH_INPUT	0x5107
-#define _STARPU_FUT_END_FETCH_INPUT	0x5108
-#define _STARPU_FUT_START_PUSH_OUTPUT	0x5109
-#define _STARPU_FUT_END_PUSH_OUTPUT	0x5110
+#define _STARPU_FUT_START_FETCH_INPUT_ON_TID	0x5107
+#define _STARPU_FUT_END_FETCH_INPUT_ON_TID	0x5108
+#define _STARPU_FUT_START_PUSH_OUTPUT_ON_TID	0x5109
+#define _STARPU_FUT_END_PUSH_OUTPUT_ON_TID	0x5110
 
 #define _STARPU_FUT_TAG		0x5111
 #define _STARPU_FUT_TAG_DEPS	0x5112
@@ -104,8 +106,8 @@
 #define	_STARPU_FUT_START_DRIVER_COPY_ASYNC	0x5135
 #define	_STARPU_FUT_END_DRIVER_COPY_ASYNC	0x5136
 
-#define	_STARPU_FUT_START_PROGRESS	0x5137
-#define	_STARPU_FUT_END_PROGRESS		0x5138
+#define	_STARPU_FUT_START_PROGRESS_ON_TID	0x5137
+#define	_STARPU_FUT_END_PROGRESS_ON_TID		0x5138
 
 #define _STARPU_FUT_USER_EVENT		0x5139
 
@@ -151,8 +153,8 @@
 
 #define _STARPU_FUT_DATA_LOAD 0x5153
 
-#define _STARPU_FUT_START_UNPARTITION 0x5154
-#define _STARPU_FUT_END_UNPARTITION 0x5155
+#define _STARPU_FUT_START_UNPARTITION_ON_TID 0x5154
+#define _STARPU_FUT_END_UNPARTITION_ON_TID 0x5155
 
 #define	_STARPU_FUT_START_FREE		0x5156
 #define	_STARPU_FUT_END_FREE		0x5157
@@ -209,6 +211,9 @@
 #define _STARPU_FUT_HANDLE_DATA_REGISTER 0x517c
 #define _STARPU_FUT_DATA_INVALIDATE 0x517d
 
+#define _STARPU_FUT_START_FETCH_INPUT	0x517e
+#define _STARPU_FUT_END_FETCH_INPUT	0x517f
+
 #ifdef STARPU_USE_FXT
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
@@ -525,16 +530,22 @@ do {									\
 	FUT_DO_PROBE2(_STARPU_FUT_UPDATE_TASK_CNT, counter, _starpu_gettid())
 
 #define _STARPU_TRACE_START_FETCH_INPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT, job, _starpu_gettid());
+	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT_ON_TID, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_FETCH_INPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT, job, _starpu_gettid());
+	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT_ON_TID, job, _starpu_gettid());
 
 #define _STARPU_TRACE_START_PUSH_OUTPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_PUSH_OUTPUT, job, _starpu_gettid());
+	FUT_DO_PROBE2(_STARPU_FUT_START_PUSH_OUTPUT_ON_TID, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_PUSH_OUTPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_PUSH_OUTPUT, job, _starpu_gettid());
+	FUT_DO_PROBE2(_STARPU_FUT_END_PUSH_OUTPUT_ON_TID, job, _starpu_gettid());
+
+#define _STARPU_TRACE_WORKER_END_FETCH_INPUT(job, id)	\
+	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT, job, id);
+
+#define _STARPU_TRACE_WORKER_START_FETCH_INPUT(job, id)	\
+	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT, job, id);
 
 #define _STARPU_TRACE_TAG(tag, job)	\
 	FUT_DO_PROBE2(_STARPU_FUT_TAG, tag, (job)->job_id)
@@ -903,10 +914,10 @@ do {										\
 	FUT_DO_PROBE2(_STARPU_FUT_DATA_LOAD, workerid, size);
 
 #define _STARPU_TRACE_START_UNPARTITION(handle, memnode)		\
-	FUT_DO_PROBE3(_STARPU_FUT_START_UNPARTITION, memnode, _starpu_gettid(), handle);
+	FUT_DO_PROBE3(_STARPU_FUT_START_UNPARTITION_ON_TID, memnode, _starpu_gettid(), handle);
 	
 #define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		\
-	FUT_DO_PROBE3(_STARPU_FUT_END_UNPARTITION, memnode, _starpu_gettid(), handle);
+	FUT_DO_PROBE3(_STARPU_FUT_END_UNPARTITION_ON_TID, memnode, _starpu_gettid(), handle);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_PUSH_PRIO(workerid, ntasks, exp_len)		\
 	FUT_DO_PROBE4(_STARPU_FUT_SCHED_COMPONENT_PUSH_PRIO, _starpu_gettid(), workerid, ntasks, exp_len);
@@ -1046,6 +1057,8 @@ do {										\
 #define _STARPU_TRACE_SCHED_COMPONENT_PULL(from, to, task)	do {} while (0)
 #define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)	do {} while (0)
 #define _STARPU_TRACE_DATA_INVALIDATE(handle, node)	do {} while (0)
+#define _STARPU_TRACE_WORKER_START_FETCH_INPUT(job, id)	do {} while(0)
+#define _STARPU_TRACE_WORKER_END_FETCH_INPUT(job, id)	do {} while(0)
 
 #endif // STARPU_USE_FXT
 

+ 1 - 1
src/core/dependencies/data_arbiter_concurrency.c

@@ -271,7 +271,7 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+			_starpu_datawizard_progress(0);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);

+ 1 - 1
src/core/dependencies/data_concurrency.c

@@ -130,7 +130,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+			_starpu_datawizard_progress(0);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);

+ 5 - 0
src/core/perfmodel/perfmodel.c

@@ -123,6 +123,8 @@ double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arc
 			coef = _STARPU_MIC_ALPHA;
 		else if (perf_arch->devices[dev].type == STARPU_SCC_WORKER)
 			coef = _STARPU_SCC_ALPHA;
+		else if (perf_arch->devices[dev].type == STARPU_MPI_WORKER)
+			coef = _STARPU_MPI_MS_ALPHA;
 
 		speedup += coef * (perf_arch->devices[dev].ncores);
 	}
@@ -263,6 +265,9 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 			case STARPU_SCC_WORKER:
 				node_kind = STARPU_SCC_RAM;
 				break;
+			case STARPU_MPI_WORKER:
+				node_kind = STARPU_MPI_MS_RAM;
+				break;
 			default:
 				STARPU_ABORT();
 				break;

+ 244 - 49
src/core/perfmodel/perfmodel_bus.c

@@ -37,6 +37,7 @@
 #include <core/simgrid.h>
 #include <core/topology.h>
 #include <common/utils.h>
+#include <drivers/mpi/driver_mpi_common.h>
 
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
@@ -77,6 +78,7 @@ static unsigned ncpus = 0;
 static unsigned ncuda = 0;
 static unsigned nopencl = 0;
 static unsigned nmic = 0;
+static unsigned nmpi_ms = 0;
 
 /* Benchmarking the performance of the bus */
 
@@ -121,6 +123,11 @@ static double mic_time_host_to_device[STARPU_MAXNODES] = {0.0};
 static double mic_time_device_to_host[STARPU_MAXNODES] = {0.0};
 #endif /* STARPU_USE_MIC */
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+static double mpi_time_device_to_device[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS] = {{0.0}};
+static double mpi_latency_device_to_device[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS] = {{0.0}};
+#endif
+
 #ifdef STARPU_HAVE_HWLOC
 static hwloc_topology_t hwtopology;
 #endif
@@ -663,7 +670,7 @@ static void benchmark_all_gpu_devices(void)
 	_STARPU_DISP("can not measure bus in simgrid mode, please run starpu_calibrate_bus in non-simgrid mode to make sure the bus performance model was calibrated\n");
 	STARPU_ABORT();
 #else /* !SIMGRID */
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 	unsigned i;
 #endif
 #ifdef HAVE_CUDA_MEMCPY_PEER
@@ -739,6 +746,12 @@ static void benchmark_all_gpu_devices(void)
 	}
 #endif /* STARPU_USE_MIC */
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+    
+        _starpu_mpi_common_measure_bandwidth_latency(mpi_time_device_to_device, mpi_latency_device_to_device);
+
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
+
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
 	hwloc_bitmap_free(former_cpuset);
@@ -928,6 +941,12 @@ static void generate_bus_affinity_file(void)
 	if (!was_benchmarked)
 		benchmark_all_gpu_devices();
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        /* Slaves don't write files */
+        if (!_starpu_mpi_common_is_src_node())
+                return;
+#endif
+
 	write_bus_affinity_file_content();
 }
 
@@ -1145,6 +1164,9 @@ static void write_bus_latency_file_content(void)
 #ifdef STARPU_USE_MIC
         maxnode += nmic;
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        maxnode += nmpi_ms;
+#endif
         for (src = 0; src < STARPU_MAXNODES; src++)
 	{
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
@@ -1177,12 +1199,48 @@ static void write_bus_latency_file_content(void)
 				}
 #endif
 #ifdef STARPU_USE_OPENCL
-				if (src > ncuda)
+				if (src > ncuda && src <= ncuda + nopencl)
 					latency += opencldev_latency_dtoh[src-ncuda];
-				if (dst > ncuda)
+				if (dst > ncuda && dst <= ncuda + nopencl)
 					latency += opencldev_latency_htod[dst-ncuda];
 #endif
-			}
+                                /* TODO Latency MIC */
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+                                /* Convert src and dst into MPI indices: indices at or above the master's
+                                 * are shifted by one, because only the slaves are counted as MPI devices */
+                                int mpi_master = _starpu_mpi_common_get_src_node();
+
+                                int mpi_src = src - (ncuda + nopencl + nmic) - 1;
+                                mpi_src = (mpi_master <= mpi_src) ? mpi_src+1 : mpi_src;
+
+                                int mpi_dst = dst - (ncuda + nopencl + nmic) - 1;
+                                mpi_dst = (mpi_master <= mpi_dst) ? mpi_dst+1 : mpi_dst;
+
+                                if (src > ncuda + nopencl + nmic && src <= ncuda + nopencl + nmic + nmpi_ms)
+                                {
+                                        if (dst > ncuda + nopencl + nmic && dst <= ncuda + nopencl + nmic + nmpi_ms)
+                                        {
+                                                /* src and dst identify 2 MPI devices */
+                                                latency += mpi_latency_device_to_device[mpi_src][mpi_dst];
+                                        }
+                                        else
+                                        {
+                                                /* Only src represents an MPI device 
+                                                 * So we add latency between src and master */
+                                                latency += mpi_latency_device_to_device[mpi_src][mpi_master];
+                                        }
+                                }
+                                else
+                                {
+                                        if (dst > ncuda + nopencl + nmic && dst <= ncuda + nopencl + nmic + nmpi_ms)
+                                        {
+                                                /* Only dst identifies an MPI device 
+                                                 * So we add latency between master and dst */
+                                                latency += mpi_latency_device_to_device[mpi_master][mpi_dst];
+                                        }
+                                }
+#endif
+                        }
 
 			if (dst)
 				fputc('\t', f);
@@ -1203,6 +1261,12 @@ static void generate_bus_latency_file(void)
 	if (!was_benchmarked)
 		benchmark_all_gpu_devices();
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        /* Slaves don't write files */
+        if (!_starpu_mpi_common_is_src_node())
+                return;
+#endif
+
 #ifndef STARPU_SIMGRID
 	write_bus_latency_file_content();
 #endif
@@ -1366,6 +1430,9 @@ static void write_bus_bandwidth_file_content(void)
 #ifdef STARPU_USE_MIC
         maxnode += nmic;
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        maxnode += nmpi_ms;
+#endif
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
@@ -1376,7 +1443,7 @@ static void write_bus_bandwidth_file_content(void)
 			{
 				bandwidth = NAN;
 			}
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 			else if (src != dst)
 			{
 				double slowness = 0.0;
@@ -1403,11 +1470,47 @@ static void write_bus_bandwidth_file_content(void)
 					slowness += opencldev_timing_htod[dst-ncuda];
 #endif
 #ifdef STARPU_USE_MIC
-				if (src > ncuda + nopencl)
+				if (src > ncuda + nopencl && src <= ncuda + nopencl + nmic)
 					slowness += mic_time_device_to_host[src - (ncuda + nopencl)];
-				if (dst > ncuda + nopencl)
+				if (dst > ncuda + nopencl && dst <= ncuda + nopencl + nmic)
 					slowness += mic_time_host_to_device[dst - (ncuda + nopencl)];
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+                                /* Convert src and dst into MPI indices: indices at or above the
+                                 * master's are shifted by one, because only the slaves are counted as devices */
+                                int mpi_master = _starpu_mpi_common_get_src_node();
+
+                                int mpi_src = src - (ncuda + nopencl + nmic) - 1;
+                                mpi_src = (mpi_master <= mpi_src) ? mpi_src+1 : mpi_src;
+
+                                int mpi_dst = dst - (ncuda + nopencl + nmic) - 1;
+                                mpi_dst = (mpi_master <= mpi_dst) ? mpi_dst+1 : mpi_dst;
+
+                                /* here we have bandwidth */
+                                if (src > ncuda + nopencl + nmic && src <= ncuda + nopencl + nmic + nmpi_ms)
+                                {
+                                        if (dst > ncuda + nopencl + nmic && dst <= ncuda + nopencl + nmic + nmpi_ms)
+                                        {
+                                                /* src and dst identify 2 MPI devices */
+                                                slowness += 1.0/mpi_time_device_to_device[mpi_src][mpi_dst];
+                                        }
+                                        else
+                                        {
+                                                /* Only src represents an MPI device 
+                                                 * So we add bandwidth between src and master */
+                                                slowness += 1.0/mpi_time_device_to_device[mpi_src][mpi_master];
+                                        }
+                                }
+                                else
+                                {
+                                        if (dst > ncuda + nopencl + nmic && dst <= ncuda + nopencl + nmic + nmpi_ms)
+                                        {
+                                                /* Only dst identifies an MPI device 
+                                                 * So we add bandwidth between master and dst */
+                                                slowness += 1.0/mpi_time_device_to_device[mpi_master][mpi_dst];
+                                        }
+                                }
+#endif
 				bandwidth = 1.0/slowness;
 			}
 #endif
@@ -1457,6 +1560,9 @@ void starpu_bus_print_bandwidth(FILE *f)
 #ifdef STARPU_USE_MIC
         maxnode += nmic;
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        maxnode += nmpi_ms;
+#endif
 
 	fprintf(f, "from/to\t");
 	fprintf(f, "RAM\t");
@@ -1466,6 +1572,8 @@ void starpu_bus_print_bandwidth(FILE *f)
 		fprintf(f, "OpenCL%u\t", dst);
 	for (dst = 0; dst < nmic; dst++)
 		fprintf(f, "MIC%u\t", dst);
+	for (dst = 0; dst < nmpi_ms; dst++)
+		fprintf(f, "MPI_MS%d\t", dst);
 	fprintf(f, "\n");
 
 	for (src = 0; src <= maxnode; src++)
@@ -1476,8 +1584,10 @@ void starpu_bus_print_bandwidth(FILE *f)
 			fprintf(f, "CUDA %u\t", src-1);
 		else if (src <= ncuda + nopencl)
 			fprintf(f, "OpenCL%u\t", src-ncuda-1);
-		else
+		else if (src <= ncuda + nopencl + nmic)
 			fprintf(f, "MIC%u\t", src-ncuda-nopencl-1);
+                else
+			fprintf(f, "MPI_MS%d\t", src-ncuda-nopencl-nmic-1);
 		for (dst = 0; dst <= maxnode; dst++)
 			fprintf(f, "%.0f\t", bandwidth_matrix[src][dst]);
 
@@ -1493,8 +1603,10 @@ void starpu_bus_print_bandwidth(FILE *f)
 			fprintf(f, "CUDA %u\t", src-1);
 		else if (src <= ncuda + nopencl)
 			fprintf(f, "OpenCL%u\t", src-ncuda-1);
-		else
+		else if (src <= ncuda + nopencl + nmic)
 			fprintf(f, "MIC%u\t", src-ncuda-nopencl-1);
+                else
+			fprintf(f, "MPI_MS%d\t", src-ncuda-nopencl-nmic-1);
 		for (dst = 0; dst <= maxnode; dst++)
 			fprintf(f, "%.0f\t", latency_matrix[src][dst]);
 
@@ -1550,6 +1662,12 @@ static void generate_bus_bandwidth_file(void)
 {
 	if (!was_benchmarked)
 		benchmark_all_gpu_devices();
+    
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        /* Slaves don't write files */
+        if (!_starpu_mpi_common_is_src_node())
+                return;
+#endif
 
 #ifndef STARPU_SIMGRID
 	write_bus_bandwidth_file_content();
@@ -1580,56 +1698,125 @@ static void get_config_path(char *path, size_t maxlen)
 	get_bus_path("config", path, maxlen);
 }
 
+#if defined(STARPU_USE_MPI_MASTER_SLAVE)
+/* Agree across every process of MPI_COMM_WORLD on whether the bus has to be
+ * recalibrated.
+ *
+ * my_recalibrate: non-zero when the calling process wants a recalibration.
+ * Returns 1 when at least one process asked for it, 0 otherwise.
+ *
+ * This is a collective operation: all the processes have to call it. A
+ * logical-OR reduction is used so that no receive buffer has to be sized
+ * from the number of processes. */
+static int mpi_check_recalibrate(int my_recalibrate)
+{
+        int wants_recalibrate = (my_recalibrate != 0);
+        int any_recalibrate = 0;
+
+        MPI_Allreduce(&wants_recalibrate, &any_recalibrate, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
+
+        return any_recalibrate != 0;
+}
+#endif
+
+/* Compare a value stored in the bus config file with the value detected on
+ * the running machine, and force a new bus sampling when they differ.
+ *
+ * msg:          label of the compared value, only used in the displayed message
+ * val_file:     value read from the config file
+ * val_detected: value detected at run time
+ *
+ * With STARPU_USE_MPI_MASTER_SLAVE the decision goes through
+ * mpi_check_recalibrate(), so a mismatch seen by one process makes all of
+ * them resample; only the master displays the messages. */
+static void compare_value_and_recalibrate(const char *msg, unsigned val_file, unsigned val_detected)
+{
+        int recalibrate = (val_file != val_detected);
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        /* Share the decision: another process may have read a different value from its config file */
+        recalibrate = mpi_check_recalibrate(recalibrate);
+#endif
+
+        if (recalibrate)
+        {
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+                /* Only the master prints the message */
+                if (_starpu_mpi_common_is_src_node())
+#endif
+                        /* both values are unsigned, hence %u */
+                        _STARPU_DISP("Current configuration does not match the bus performance model (%s: (stored) %u != (current) %u), recalibrating...\n", msg, val_file, val_detected);
+
+                _starpu_bus_force_sampling();
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+                if (_starpu_mpi_common_is_src_node())
+#endif
+                        _STARPU_DISP("... done\n");
+        }
+}
+
 static void check_bus_config_file(void)
 {
         int res;
         char path[256];
         struct _starpu_machine_config *config = _starpu_get_machine_config();
+	int recalibrate = 0;
 
         get_config_path(path, sizeof(path));
         res = access(path, F_OK);
+
 	if (res || config->conf.bus_calibrate > 0)
+		recalibrate = 1;
+
+#if defined(STARPU_USE_MPI_MASTER_SLAVE)
+	// Share the decision between all processes: everybody recalibrates as soon as one process lacks the config file or requests a calibration
+	recalibrate = mpi_check_recalibrate(recalibrate);
+#endif
+
+	if (recalibrate)
 	{
 		if (res)
 			_STARPU_DISP("No performance model for the bus, calibrating...\n");
 		_starpu_bus_force_sampling();
 		if (res)
 			_STARPU_DISP("... done\n");
-        }
-        else
+	}
+	else
 	{
                 FILE *f;
                 int ret;
-		unsigned read_cuda = -1, read_opencl = -1, read_mic = -1;
+                unsigned read_cuda = -1, read_opencl = -1, read_mic = -1, read_mpi_ms = -1;
                 unsigned read_cpus = -1;
-		int locked;
+                int locked;
 
                 // Loading configuration from file
                 f = fopen(path, "r");
                 STARPU_ASSERT(f);
-		locked = _starpu_frdlock(f) == 0;
+                locked = _starpu_frdlock(f) == 0;
                 _starpu_drop_comments(f);
+
                 ret = fscanf(f, "%u\t", &read_cpus);
-		STARPU_ASSERT(ret == 1);
+                STARPU_ASSERT(ret == 1);
                 _starpu_drop_comments(f);
-		ret = fscanf(f, "%u\t", &read_cuda);
-		STARPU_ASSERT(ret == 1);
+
+                ret = fscanf(f, "%u\t", &read_cuda);
+                STARPU_ASSERT(ret == 1);
                 _starpu_drop_comments(f);
-		ret = fscanf(f, "%u\t", &read_opencl);
-		STARPU_ASSERT(ret == 1);
+
+                ret = fscanf(f, "%u\t", &read_opencl);
+                STARPU_ASSERT(ret == 1);
                 _starpu_drop_comments(f);
-		ret = fscanf(f, "%u\t", &read_mic);
-		if (ret == 0)
-			read_mic = 0;
+
+                ret = fscanf(f, "%u\t", &read_mic);
+                if (ret == 0)
+                        read_mic = 0;
+                _starpu_drop_comments(f);
+
+                ret = fscanf(f, "%u\t", &read_mpi_ms);
+                if (ret == 0)
+                        read_mpi_ms = 0;
                 _starpu_drop_comments(f);
-		if (locked)
-			_starpu_frdunlock(f);
+
+                if (locked)
+                        _starpu_frdunlock(f);
                 fclose(f);
 
                 // Loading current configuration
                 ncpus = _starpu_topology_get_nhwcpu(config);
 #ifdef STARPU_USE_CUDA
-		ncuda = _starpu_get_cuda_device_count();
+                ncuda = _starpu_get_cuda_device_count();
 #endif
 #ifdef STARPU_USE_OPENCL
                 nopencl = _starpu_opencl_get_device_count();
@@ -1637,32 +1824,16 @@ static void check_bus_config_file(void)
 #ifdef STARPU_USE_MIC
                 nmic = _starpu_mic_src_get_device_count();
 #endif /* STARPU_USE_MIC */
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+                nmpi_ms = _starpu_mpi_src_get_device_count();
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
 
                 // Checking if both configurations match
-                if (read_cpus != ncpus)
-		{
-			_STARPU_DISP("Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...\n", read_cpus, ncpus);
-                        _starpu_bus_force_sampling();
-			_STARPU_DISP("... done\n");
-                }
-                else if (read_cuda != ncuda)
-		{
-                        _STARPU_DISP("Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...\n", read_cuda, ncuda);
-                        _starpu_bus_force_sampling();
-			_STARPU_DISP("... done\n");
-                }
-                else if (read_opencl != nopencl)
-		{
-                        _STARPU_DISP("Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...\n", read_opencl, nopencl);
-                        _starpu_bus_force_sampling();
-			_STARPU_DISP("... done\n");
-                }
-                else if (read_mic != nmic)
-		{
-                        _STARPU_DISP("Current configuration does not match the bus performance model (MIC: (stored) %d != (current) %d), recalibrating...\n", read_mic, nmic);
-                        _starpu_bus_force_sampling();
-			_STARPU_DISP("... done\n");
-                }
+                compare_value_and_recalibrate("CPUS", read_cpus, ncpus);
+                compare_value_and_recalibrate("CUDA", read_cuda, ncuda);
+                compare_value_and_recalibrate("OpenCL", read_opencl, nopencl);
+                compare_value_and_recalibrate("MIC", read_mic, nmic);
+                compare_value_and_recalibrate("MPI Master-Slave", read_mpi_ms, nmpi_ms);
         }
 }
 
@@ -1687,6 +1858,7 @@ static void write_bus_config_file_content(void)
         fprintf(f, "%u # Number of CUDA devices\n", ncuda);
         fprintf(f, "%u # Number of OpenCL devices\n", nopencl);
         fprintf(f, "%u # Number of MIC devices\n", nmic);
+        fprintf(f, "%d # Number of MPI devices\n", nmpi_ms);
 
 	if (locked)
 		_starpu_fwrunlock(f);
@@ -1697,6 +1869,12 @@ static void generate_bus_config_file(void)
 {
 	if (!was_benchmarked)
 		benchmark_all_gpu_devices();
+    
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        /* Slaves don't write files */
+        if (!_starpu_mpi_common_is_src_node())
+                return;
+#endif
 
 	write_bus_config_file_content();
 }
@@ -2427,6 +2605,12 @@ static void generate_bus_platform_file(void)
 	if (!was_benchmarked)
 		benchmark_all_gpu_devices();
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        /* Slaves don't write files */
+        if (!_starpu_mpi_common_is_src_node())
+                return;
+#endif
+
 	write_bus_platform_file_content(3);
 	write_bus_platform_file_content(4);
 }
@@ -2480,12 +2664,23 @@ void _starpu_load_bus_performance_files(void)
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SIMGRID)
 	nopencl = _starpu_opencl_get_device_count();
 #endif
+#if defined(STARPU_USE_MPI_MASTER_SLAVE) || defined(STARPU_USE_SIMGRID)
+        nmpi_ms = _starpu_mpi_src_get_device_count();
+#endif
 #if defined(STARPU_USE_MIC) || defined(STARPU_USE_SIMGRID)
 	nmic = _starpu_mic_src_get_device_count();
 #endif
 
 #ifndef STARPU_SIMGRID
         check_bus_config_file();
+#endif
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        /* be sure that master wrote the perf files */
+        _starpu_mpi_common_barrier();
+#endif
+
+#ifndef STARPU_SIMGRID
 	load_bus_affinity_file();
 #endif
 	load_bus_latency_file();

+ 13 - 4
src/core/perfmodel/perfmodel_history.c

@@ -568,6 +568,8 @@ static enum starpu_worker_archtype _get_enum_type(int type)
 			return STARPU_MIC_WORKER;
         	case 4:
 			return STARPU_SCC_WORKER;
+        	case 5:
+			return STARPU_MPI_WORKER;
 		default:
 			STARPU_ABORT();
 	}
@@ -715,7 +717,7 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 		{
 			fprintf(f, "####################\n");
 			fprintf(f, "# DEV_%d\n", dev);
-			fprintf(f, "# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3, SCC - 4)\n");
+			fprintf(f, "# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3, SCC - 4, MPI_MS - 5)\n");
 			fprintf(f, "%u\n", arch_combs[comb]->devices[dev].type);
 
 			fprintf(f, "####################\n");
@@ -904,11 +906,14 @@ void _starpu_initialize_registered_performance_models(void)
 	unsigned i;
 	for(i = 0; i < conf->topology.nhwmicdevices; i++)
 		nmic += conf->topology.nhwmiccores[i];
+	unsigned nmpi = 0;
+	for(i = 0; i < conf->topology.nhwmpidevices; i++)
+		nmpi += conf->topology.nhwmpicores[i];
 	unsigned nscc = conf->topology.nhwscc;
 
-	// We used to allocate 2**(ncores + ncuda + nopencl + nmic + nscc), this is too big
-	// We now allocate only 2*(ncores + ncuda + nopencl + nmic + nscc), and reallocate when necessary in starpu_perfmodel_arch_comb_add
-	nb_arch_combs = 2 * (ncores + ncuda + nopencl + nmic + nscc);
+	// We used to allocate 2**(ncores + ncuda + nopencl + nmic + nscc + nmpi), this is too big
+	// We now allocate only 2*(ncores + ncuda + nopencl + nmic + nscc + nmpi), and reallocate when necessary in starpu_perfmodel_arch_comb_add
+	nb_arch_combs = 2 * (ncores + ncuda + nopencl + nmic + nscc + nmpi);
 	_STARPU_MALLOC(arch_combs, nb_arch_combs*sizeof(struct starpu_perfmodel_arch*));
 	current_arch_comb = 0;
 	STARPU_PTHREAD_RWLOCK_INIT(&arch_combs_mutex, NULL);
@@ -918,6 +923,7 @@ void _starpu_initialize_registered_performance_models(void)
 	ignore_devid[STARPU_CUDA_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_CUDA", 0);
 	ignore_devid[STARPU_OPENCL_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL", 0);
 	ignore_devid[STARPU_MIC_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_MIC", 0);
+	ignore_devid[STARPU_MPI_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS", 0);
 	ignore_devid[STARPU_SCC_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_SCC", 0);
 }
 
@@ -1200,6 +1206,9 @@ char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
 		case(STARPU_SCC_WORKER):
 			return "scc";
 			break;
+		case(STARPU_MPI_WORKER):
+			return "mpi_ms";
+			break;
 		default:
 			STARPU_ABORT();
 			break;

+ 15 - 1
src/core/task.c

@@ -485,6 +485,18 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 
 	some_impl = 0;
 	for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
+		if (cl->mpi_ms_funcs[i])
+		{
+			some_impl = 1;
+			break;
+		}
+	if (some_impl && is_where_unset)
+	{
+		cl->where |= STARPU_MPI_MS;
+	}
+
+	some_impl = 0;
+	for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
 		if (cl->scc_funcs[i])
 		{
 			some_impl = 1;
@@ -504,7 +516,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 		}
 	if (some_impl && is_where_unset)
 	{
-		cl->where |= STARPU_MIC|STARPU_SCC;
+		cl->where |= STARPU_MIC|STARPU_SCC|STARPU_MPI_MS;
 	}
 }
 
@@ -1146,6 +1158,7 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 				case STARPU_CUDA_RAM:      /* Fall through */
 				case STARPU_OPENCL_RAM:
 				case STARPU_MIC_RAM:
+                                case STARPU_MPI_MS_RAM:
 				case STARPU_SCC_RAM:
 					return 1;
 				default:
@@ -1163,6 +1176,7 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 				case STARPU_CUDA_RAM:
 				case STARPU_OPENCL_RAM:
 				case STARPU_MIC_RAM:
+                                case STARPU_MPI_MS_RAM:
 				case STARPU_SCC_RAM:
 					return 0;
 				default:

+ 5 - 0
src/core/task.h

@@ -111,6 +111,11 @@ static inline starpu_mic_func_t _starpu_task_get_mic_nth_implementation(struct s
 	return cl->mic_funcs[nimpl];
 }
 
+/* Return the entry number NIMPL of the mpi_ms_funcs array of codelet CL.
+ * No bound check is performed on NIMPL here. */
+static inline starpu_mpi_ms_func_t _starpu_task_get_mpi_ms_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	return cl->mpi_ms_funcs[nimpl];
+}
+
 static inline starpu_scc_func_t _starpu_task_get_scc_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
 {
 	return cl->scc_funcs[nimpl];

+ 274 - 36
src/core/topology.c

@@ -26,6 +26,8 @@
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/mic/driver_mic_source.h>
 #include <drivers/scc/driver_scc_source.h>
+#include <drivers/mpi/driver_mpi_source.h>
+#include <drivers/mpi/driver_mpi_common.h>
 #include <drivers/mp_common/source_common.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <profiling/profiling.h>
@@ -58,7 +60,7 @@ static int nobind;
 /* For checking whether two workers share the same PU, indexed by PU number */
 static int cpu_worker[STARPU_MAXCPUS];
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 
 struct handle_entry
 {
@@ -81,6 +83,9 @@ static struct _starpu_worker_set cuda_worker_set[STARPU_MAXCUDADEVS];
 #ifdef STARPU_USE_MIC
 static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
+#endif
 
 void *
 _starpu_get_worker_from_driver(struct starpu_driver *d)
@@ -138,7 +143,7 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
  * Discover the topology of the machine
  */
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC)  || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC)  || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 static void
 _starpu_initialize_workers_deviceid (int *explicit_workers_gpuid,
 				  int *current, int *workers_gpuid,
@@ -395,6 +400,31 @@ static inline int _starpu_get_next_scc_deviceid(struct _starpu_machine_config *c
 }
 #endif
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+/* Hand out the MPI device ids stored in topology.workers_mpi_deviceid one
+ * after the other, wrapping around after topology.nmpidevices entries.
+ * Each call advances config->current_mpi_deviceid by one. */
+static inline int _starpu_get_next_mpi_deviceid(struct _starpu_machine_config *config)
+{
+	unsigned slot = config->current_mpi_deviceid % config->topology.nmpidevices;
+
+	config->current_mpi_deviceid++;
+
+	return (int)config->topology.workers_mpi_deviceid[slot];
+}
+
+/* Discover the topology of the MPI node identified by MPI_IDX: ask the
+ * sink, through _starpu_src_common_sink_nbcores(), how many cores it has,
+ * and record the answer in the `nhwmpicores' topology field of CONFIG. */
+static void
+_starpu_init_mpi_topology (struct _starpu_machine_config *config, long mpi_idx)
+{
+	int sink_cores;
+
+	_starpu_src_common_sink_nbcores (mpi_ms_nodes[mpi_idx], &sink_cores);
+
+	config->topology.nhwmpicores[mpi_idx] = sink_cores;
+}
+
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
+
 #ifdef STARPU_USE_MIC
 static void
 _starpu_init_mic_topology (struct _starpu_machine_config *config, long mic_idx)
@@ -583,6 +613,9 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 #ifdef STARPU_USE_SCC
 	config->topology.nhwscc = _starpu_scc_src_get_device_count();
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE 
+        config->topology.nhwmpi = _starpu_mpi_src_get_device_count();
+#endif
 
 	topology_is_initialized = 1;
 }
@@ -870,16 +903,75 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 	}
 
 	topology->nworkers += topology->nmiccores[mic_idx];
-    }
+}  
 
-#ifdef STARPU_USE_MIC
 static COIENGINE mic_handles[STARPU_MAXMICDEVS];
 COIPROCESS _starpu_mic_process[STARPU_MAXMICDEVS];
 #endif
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+/* Configure the workers of the MPI slave device number MPI_IDX.
+ *
+ * The number of cores of the device is detected with
+ * _starpu_init_mpi_topology(). The number of workers actually used is taken
+ * from the STARPU_NMPIMSTHREADS environment variable, capped by the detected
+ * number of cores. One worker entry of CONFIG is then filled per core, and
+ * topology->nworkers is increased accordingly.
+ *
+ * USER_CONF is not read by this function. */
+static void
+_starpu_init_mpi_config (struct _starpu_machine_config *config,
+                struct starpu_conf *user_conf,
+                unsigned mpi_idx)
+{
+        struct _starpu_machine_topology *topology = &config->topology;
+
+        topology->nhwmpicores[mpi_idx] = 0;
+
+        /* Fills topology->nhwmpicores[mpi_idx] */
+        _starpu_init_mpi_topology (config, mpi_idx);
+
+        int nmpicores;
+        nmpicores = starpu_get_env_number("STARPU_NMPIMSTHREADS");
+
+        if (nmpicores == -1)
+        {
+                /* Nothing was specified, so let's use the number of
+                 * detected mpi cores. */
+                nmpicores = topology->nhwmpicores[mpi_idx];
+        }
+        else
+        {
+                if ((unsigned) nmpicores > topology->nhwmpicores[mpi_idx])
+                {
+                        /* The user requires more MPI cores than there are available */
+                        fprintf(stderr,
+                                        "# Warning: %d MPI cores requested. Only %d available.\n",
+                                        nmpicores, topology->nhwmpicores[mpi_idx]);
+                        nmpicores = topology->nhwmpicores[mpi_idx];
+                }
+        }
+
+        topology->nmpicores[mpi_idx] = nmpicores;
+        /* mpi_idx is unsigned, hence %u.
+         * NOTE(review): the other values printed with %d look unsigned too -- confirm
+         * against the declaration of struct _starpu_machine_topology. */
+        STARPU_ASSERT_MSG(topology->nmpicores[mpi_idx] + topology->nworkers <= STARPU_NMAXWORKERS,
+                        "topology->nmpicores[mpi_idx(%u)] (%d) + topology->nworkers (%d) <= STARPU_NMAXWORKERS (%d)",
+                        mpi_idx, topology->nmpicores[mpi_idx], topology->nworkers, STARPU_NMAXWORKERS);
+
+        /* The workers of this device are stored contiguously, starting at the current end of the worker array */
+        mpi_worker_set[mpi_idx].workers = &config->workers[topology->nworkers];
+        unsigned mpicore_id;
+        for (mpicore_id = 0; mpicore_id < topology->nmpicores[mpi_idx]; mpicore_id++)
+        {
+                int worker_idx = topology->nworkers + mpicore_id;
+                config->workers[worker_idx].set = &mpi_worker_set[mpi_idx];
+                config->workers[worker_idx].arch = STARPU_MPI_WORKER;
+                _STARPU_MALLOC(config->workers[worker_idx].perf_arch.devices, sizeof(struct starpu_perfmodel_device));
+                config->workers[worker_idx].perf_arch.ndevices = 1;
+                config->workers[worker_idx].perf_arch.devices[0].type = STARPU_MPI_WORKER;
+                config->workers[worker_idx].perf_arch.devices[0].devid = mpi_idx;
+                config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
+                config->workers[worker_idx].devid = mpi_idx;
+                config->workers[worker_idx].subworkerid = mpicore_id;
+                config->workers[worker_idx].worker_mask = STARPU_MPI_MS;
+                config->worker_mask |= STARPU_MPI_MS;
+        }
+
+        topology->nworkers += topology->nmpicores[mpi_idx];
+}
+#endif
+
 static void
 _starpu_init_mp_config (struct _starpu_machine_config *config,
-			struct starpu_conf *user_conf)
+			struct starpu_conf *user_conf, int no_mp_config)
 {
 	/* Discover and configure the mp topology. That means:
 	 * - discover the number of mp nodes;
@@ -890,20 +982,20 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 
 	struct _starpu_machine_topology *topology = &config->topology;
 
-	// We currently only support MIC at this level.
 #ifdef STARPU_USE_MIC
-
-	/* Discover and initialize the number of MIC nodes through the mp
-	 * infrastructure. */
-	unsigned nhwmicdevices = _starpu_mic_src_get_device_count();
-
-	int reqmicdevices = starpu_get_env_number("STARPU_NMIC");
-	if (reqmicdevices == -1 && user_conf)
-		reqmicdevices = user_conf->nmic;
-	if (reqmicdevices == -1)
-		/* Nothing was specified, so let's use the number of
-		 * detected mic devices. ! */
-		reqmicdevices = nhwmicdevices;
+    if (!no_mp_config)
+    {
+        /* Discover and initialize the number of MIC nodes through the mp
+         * infrastructure. */
+        unsigned nhwmicdevices = _starpu_mic_src_get_device_count();
+
+        int reqmicdevices = starpu_get_env_number("STARPU_NMIC");
+        if (reqmicdevices == -1 && user_conf)
+            reqmicdevices = user_conf->nmic;
+        if (reqmicdevices == -1)
+            /* Nothing was specified, so let's use the number of
+             * detected mic devices. ! */
+            reqmicdevices = nhwmicdevices;
 
 	if (reqmicdevices != -1)
 	{
@@ -915,18 +1007,67 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 		}
 	}
 
-	topology->nmicdevices = 0;
-	unsigned i;
-	for (i = 0; i < (unsigned) reqmicdevices; i++)
-		if (0 == _starpu_init_mic_node (config, i, &mic_handles[i], &_starpu_mic_process[i]))
-			topology->nmicdevices++;
+        topology->nmicdevices = 0;
+        unsigned i;
+        for (i = 0; i < (unsigned) reqmicdevices; i++)
+                if (0 == _starpu_init_mic_node (config, i, &mic_handles[i], &_starpu_mic_process[i]))
+                        topology->nmicdevices++;
 
 
-	for (i = 0; i < topology->nmicdevices; i++)
-		_starpu_init_mic_config (config, user_conf, i);
+        for (i = 0; i < topology->nmicdevices; i++)
+                _starpu_init_mic_config (config, user_conf, i);
+    }
+#endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+    {
+            /* Discover and initialize the number of MPI nodes through the mp
+             * infrastructure. */
+            unsigned nhwmpidevices = _starpu_mpi_src_get_device_count();
+
+            int reqmpidevices = starpu_get_env_number("STARPU_NMPI_MS");
+            if (reqmpidevices == -1 && user_conf)
+                    reqmpidevices = user_conf->nmpi_ms;
+            if (reqmpidevices == -1)
+                    /* Nothing was specified, so let's use the number of
+                     * detected mpi devices. ! */
+                    reqmpidevices = nhwmpidevices;
+
+            if (reqmpidevices != -1)
+            {
+                    if ((unsigned) reqmpidevices > nhwmpidevices)
+                    {
+                            /* The user requires more MPI devices than there is available */
+                            fprintf(stderr,
+                                            "# Warning: %d MPI Master-Slave devices requested. Only %d available.\n",
+                                            reqmpidevices, nhwmpidevices);
+                            reqmpidevices = nhwmpidevices;
+                    }
+            }
+
+            topology->nmpidevices = reqmpidevices;
+
+            /* If the user does not want to use any MPI slave, we close the slave processes */
+            if (no_mp_config && topology->nmpidevices == 0)
+            {
+                    _starpu_mpi_common_mp_deinit();
+                    exit(0);
+            }
+
+            if (!no_mp_config)
+            {
+                    unsigned i;
+                    for (i = 0; i < topology->nmpidevices; i++)
+                            mpi_ms_nodes[i] = _starpu_mp_common_node_create(STARPU_NODE_MPI_SOURCE, i);
+
+
+                    for (i = 0; i < topology->nmpidevices; i++)
+                            _starpu_init_mpi_config (config, user_conf, i);
+            }
+    }
 #endif
 }
 
+#ifdef STARPU_USE_MIC
 static void
 _starpu_deinit_mic_node (unsigned mic_idx)
 {
@@ -936,6 +1077,17 @@ _starpu_deinit_mic_node (unsigned mic_idx)
 
 	_starpu_mp_common_node_destroy(mic_nodes[mic_idx]);
 }
+#endif
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+/* Shut down the MPI slave node number DEVID: send it the
+ * STARPU_MP_COMMAND_EXIT command, then destroy the mpi_ms_nodes[DEVID]
+ * node structure. */
+static void _starpu_deinit_mpi_node(int devid)
+{
+        /* The command is sent without any payload (NULL buffer, size 0) */
+        _starpu_mp_common_send_command(mpi_ms_nodes[devid], STARPU_MP_COMMAND_EXIT, NULL, 0);                          
+
+        _starpu_mp_common_node_destroy(mpi_ms_nodes[devid]);
+}
+#endif
+
 
 static void
 _starpu_deinit_mp_config (struct _starpu_machine_config *config)
@@ -943,11 +1095,16 @@ _starpu_deinit_mp_config (struct _starpu_machine_config *config)
 	struct _starpu_machine_topology *topology = &config->topology;
 	unsigned i;
 
+#ifdef STARPU_USE_MIC
 	for (i = 0; i < topology->nmicdevices; i++)
 		_starpu_deinit_mic_node (i);
 	_starpu_mic_clear_kernels();
-}
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+	for (i = 0; i < topology->nmpidevices; i++)
+		_starpu_deinit_mpi_node (i);
+#endif
+}
 
 #ifdef STARPU_HAVE_HWLOC
 static unsigned
@@ -1006,6 +1163,10 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 	for (i = 0; i < (int) (sizeof(mic_worker_set)/sizeof(mic_worker_set[0])); i++)
 		mic_worker_set[i].workers = NULL;
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+	for (i = 0; i < (int) (sizeof(mpi_worker_set)/sizeof(mpi_worker_set[0])); i++)
+		mpi_worker_set[i].workers = NULL;
+#endif
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	int ncuda = config->conf.ncuda;
@@ -1243,7 +1404,7 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
 		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].type = STARPU_SCC_WORKER;
 		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].devid = sccdev;
-		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].ncore = 1;
+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].ncores = 1;
 		config->workers[topology->nworkers + sccdev].subworkerid = 0;
 		config->workers[topology->nworkers + sccdev].devid = devid;
 		config->workers[topology->nworkers + sccdev].worker_mask = STARPU_SCC;
@@ -1256,12 +1417,8 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 	topology->nworkers += topology->nsccdevices;
 #endif /* STARPU_USE_SCC */
 
-
-	/* Unless not requested, we need to complete configuration with the
-	 * ones of the mp nodes. */
-#ifdef STARPU_USE_MIC
-	if (! no_mp_config)
-	    _starpu_init_mp_config (config, &config->conf);
+#if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
+	    _starpu_init_mp_config (config, &config->conf, no_mp_config);
 #endif
 
 /* we put the CPU section after the accelerator : in case there was an
@@ -1278,7 +1435,17 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 			for (j = 0; j < STARPU_MAXMICDEVS; j++)
 				mic_busy_cpus += (topology->nmiccores[j] ? 1 : 0);
 
-			unsigned already_busy_cpus = mic_busy_cpus + topology->ncudagpus
+            unsigned mpi_ms_busy_cpus = 0;
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+#ifdef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
+            for (j = 0; j < STARPU_MAXMPIDEVS; j++)
+                    mpi_ms_busy_cpus += (topology->nmpicores[j] ? 1 : 0);
+#else
+            mpi_ms_busy_cpus = 1; /* we launch one thread to control all slaves */
+#endif
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
+
+			unsigned already_busy_cpus = mpi_ms_busy_cpus + mic_busy_cpus + topology->ncudagpus
 				+ topology->nopenclgpus + topology->nsccdevices;
 
 			long avail_cpus = (long) topology->nhwcpus - (long) already_busy_cpus;
@@ -1563,6 +1730,11 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 	unsigned mic_memory_nodes[STARPU_MAXMICDEVS];
 	unsigned mic_bindid[STARPU_MAXMICDEVS];
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+	unsigned mpi_init[STARPU_MAXMPIDEVS] = { };
+	unsigned mpi_memory_nodes[STARPU_MAXMPIDEVS];
+	unsigned mpi_bindid[STARPU_MAXMPIDEVS];
+#endif
 	unsigned bindid;
 
 	for (bindid = 0; bindid < config->nbindid; bindid++)
@@ -1579,7 +1751,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 		struct _starpu_worker *workerarg = &config->workers[worker];
 		unsigned devid = workerarg->devid;
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 		/* Perhaps the worker has some "favourite" bindings  */
 		int *preferred_binding = NULL;
 		int npreferred = 0;
@@ -1610,6 +1782,10 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				}
 				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
 				_starpu_memory_node_add_nworkers(memory_node);
+
+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
+
 #ifdef STARPU_SIMGRID
 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
 				if (memory_node != STARPU_MAIN_RAM)
@@ -1701,6 +1877,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					}
 				}
 				_starpu_memory_node_add_nworkers(memory_node);
+
+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
 #ifdef STARPU_SIMGRID
 				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[memory_node]);
 				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
@@ -1740,6 +1919,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 #endif /* SIMGRID */
 				}
 				_starpu_memory_node_add_nworkers(memory_node);
+
+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
 #ifdef STARPU_SIMGRID
 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
@@ -1771,6 +1953,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				}
 				workerarg->bindid = mic_bindid[devid];
 				_starpu_memory_node_add_nworkers(memory_node);
+
+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
 #ifdef STARPU_SIMGRID
 				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[memory_node]);
 				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
@@ -1787,13 +1972,59 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
 				memory_node = ram_memory_node;
 				_starpu_memory_node_add_nworkers(memory_node);
+
+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
 #ifdef STARPU_SIMGRID
 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
 #endif
 			}
 				break;
+#endif /* STARPU_USE_SCC */
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+			case STARPU_MPI_WORKER:
+			{
+				if (mpi_init[devid])
+				{
+					memory_node = mpi_memory_nodes[devid];
+				}
+				else
+				{
+					mpi_init[devid] = 1;
+					mpi_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+					memory_node = mpi_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MPI_MS_RAM, devid);
+					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
+					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+
+				}
+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
+                                /* The MPI driver thread can manage all slave memories when MPI multiple-thread mode is disabled */
+                                unsigned findworker;
+                                for (findworker = 0; findworker < worker; findworker++)
+                                {
+                                        struct _starpu_worker *findworkerarg = &config->workers[findworker];
+                                        if (findworkerarg->arch == STARPU_MPI_WORKER)
+                                        {
+                                                _starpu_worker_drives_memory_node(workerarg->workerid, findworkerarg->memory_node);
+                                                _starpu_worker_drives_memory_node(findworkerarg->workerid, memory_node);
+                                        }
+                                }
+#endif
+                
+				workerarg->bindid = mpi_bindid[devid];
+				_starpu_memory_node_add_nworkers(memory_node);
+#ifdef STARPU_SIMGRID
+				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[memory_node]);
+				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
 #endif
+				break;
+			}
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
+
 
 			default:
 				STARPU_ABORT();
@@ -1885,6 +2116,7 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 	config->opencl_nodeid = -1;
 	config->mic_nodeid = -1;
 	config->scc_nodeid = -1;
+        config->mpi_nodeid = -1;
 	for (i = 0; i < starpu_worker_get_count(); i++)
 	{
 		switch (starpu_worker_get_type(i))
@@ -1919,6 +2151,12 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 				else if (config->scc_nodeid != (int) starpu_worker_get_memory_node(i))
 					config->scc_nodeid = -2;
 				break;
+			case STARPU_MPI_WORKER:
+				if (config->mpi_nodeid == -1)
+					config->mpi_nodeid = starpu_worker_get_memory_node(i);
+				else if (config->mpi_nodeid != (int) starpu_worker_get_memory_node(i))
+					config->mpi_nodeid = -2;
+				break;
 			case STARPU_ANY_WORKER:
 				STARPU_ASSERT(0);
 		}
@@ -1929,7 +2167,7 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 
 void _starpu_destroy_topology(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
 {
-#ifdef STARPU_USE_MIC
+#if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 	_starpu_deinit_mp_config(config);
 #endif
 

+ 155 - 9
src/core/workers.c

@@ -40,6 +40,7 @@
 #include <top/starpu_top_core.h>
 #include <drivers/mp_common/sink_common.h>
 #include <drivers/scc/driver_scc_common.h>
+#include <drivers/mpi/driver_mpi_common.h>
 
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/cuda/driver_cuda.h>
@@ -142,6 +143,10 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 				if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->mic_funcs[impl] != NULL)
 					test_implementation = 1;
 				break;
+                        case STARPU_MPI_WORKER:
+                                if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->mpi_ms_funcs[impl] != NULL)
+                                        test_implementation = 1;
+                                break;
 			case STARPU_SCC_WORKER:
 				if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->scc_funcs[impl] != NULL)
 					test_implementation = 1;
@@ -205,6 +210,11 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 	    _starpu_worker_exists_and_can_execute(task, STARPU_MIC_WORKER))
 		return 1;
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+	if ((task->cl->where & STARPU_MPI_MS) &&
+	    _starpu_worker_exists_and_can_execute(task, STARPU_MPI_WORKER))
+		return 1;
+#endif
 #ifdef STARPU_USE_SCC
 	if ((task->cl->where & STARPU_SCC) &&
 	    _starpu_worker_exists_and_can_execute(task, STARPU_SCC_WORKER))
@@ -280,6 +290,13 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 
 		return func != NULL || func_name != NULL;
 	}
+	case STARPU_MPI_WORKER:
+	{
+		starpu_mpi_ms_func_t func = _starpu_task_get_mpi_ms_nth_implementation(cl, nimpl);
+		const char *func_name = _starpu_task_get_cpu_name_nth_implementation(cl, nimpl);
+
+		return func != NULL || func_name != NULL;
+	}
 	case STARPU_SCC_WORKER:
 	{
 		starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(cl, nimpl);
@@ -521,6 +538,9 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	starpu_pthread_wait_init(&workerarg->wait);
 	starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_task_queue[workerarg->workerid]);
 #endif
+        workerarg->task_sending = NULL;
+        workerarg->nb_buffers_sent = 0;
+
 	workerarg->first_task = 0;
 	workerarg->ntasks = 0;
 	/* set initialized by topology.c */
@@ -591,7 +611,6 @@ void _starpu_driver_start(struct _starpu_worker *worker, unsigned fut_key, unsig
 	_starpu_fxt_register_thread(worker->bindid);
 	_starpu_worker_start(worker, fut_key, sync);
 #endif
-
 	_starpu_memory_node_set_local_key(&worker->memory_node);
 
 	_starpu_set_local_worker_key(worker);
@@ -639,7 +658,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
 		unsigned devid = workerarg->devid;
-#if defined(STARPU_USE_MIC) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_MIC) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 		struct _starpu_worker_set *worker_set = workerarg->set;
 #endif
 
@@ -808,13 +827,91 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 #endif
 				break;
+#endif /* STARPU_USE_SCC */
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+			case STARPU_MPI_WORKER:
+				/* We spawn only one thread
+				 * per MPI device, which will control all MPI
+				 * workers of this device. (by using a worker set). */
+				if (worker_set->workers != workerarg)
+					break;
+
+				worker_set->nworkers = pconfig->topology.nmpicores[devid];
+
+				worker_set->set_is_initialized = 0;
+
+#ifdef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
+                /* If MPI has multiple-thread support,
+                 * we launch one thread per device;
+                 * otherwise,
+                 * we launch one thread for all devices.
+                 */
+				STARPU_PTHREAD_CREATE_ON(
+						workerarg->name,
+						&worker_set->worker_thread,
+						NULL,
+						_starpu_mpi_src_worker,
+						worker_set,
+						_starpu_simgrid_get_host_by_worker(workerarg));
+
+#ifdef STARPU_USE_FXT
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 #endif
 
+				STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
+				while (!worker_set->set_is_initialized)
+					STARPU_PTHREAD_COND_WAIT(&worker_set->ready_cond,
+								  &worker_set->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
+
+				worker_set->started = 1;
+#endif /* STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD */
+
+				break;
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
+
 			default:
 				STARPU_ABORT();
 		}
 	}
 
+#if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
+        if (pconfig->topology.nmpidevices > 0)
+        {
+                struct _starpu_worker_set * worker_set_zero = &mpi_worker_set[0];
+                struct _starpu_worker * worker_zero = &worker_set_zero->workers[0];
+                STARPU_PTHREAD_CREATE_ON(
+                                worker_zero->name,
+                                &worker_set_zero->worker_thread,
+                                NULL,
+                                _starpu_mpi_src_worker,
+                                &mpi_worker_set,
+                                _starpu_simgrid_get_host_by_worker(worker_zero));
+
+                /* We use the first worker to know whether everything is finished */
+#ifdef STARPU_USE_FXT
+                STARPU_PTHREAD_MUTEX_LOCK(&worker_zero->mutex);
+                while (!worker_zero->worker_is_running)
+                        STARPU_PTHREAD_COND_WAIT(&worker_zero->started_cond, &worker_zero->mutex);
+                STARPU_PTHREAD_MUTEX_UNLOCK(&worker_zero->mutex);
+#endif
+
+                STARPU_PTHREAD_MUTEX_LOCK(&worker_set_zero->mutex);
+                while (!worker_set_zero->set_is_initialized)
+                        STARPU_PTHREAD_COND_WAIT(&worker_set_zero->ready_cond,
+                                        &worker_set_zero->mutex);
+                STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set_zero->mutex);
+
+                worker_set_zero->started = 1;
+                worker_set_zero->worker_thread = mpi_worker_set[0].worker_thread;
+
+        }
+
+#endif
+
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
@@ -869,6 +966,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				break;
 #endif
 			case STARPU_MIC_WORKER:
+                        case STARPU_MPI_WORKER:
 				/* Already waited above */
 				break;
 			case STARPU_SCC_WORKER:
@@ -911,6 +1009,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 	conf->nopencl = starpu_get_env_number("STARPU_NOPENCL");
 	conf->nmic = starpu_get_env_number("STARPU_NMIC");
 	conf->nscc = starpu_get_env_number("STARPU_NSCC");
+	conf->nmpi_ms = starpu_get_env_number("STARPU_NMPI_MS");
 	conf->calibrate = starpu_get_env_number("STARPU_CALIBRATE");
 	conf->bus_calibrate = starpu_get_env_number("STARPU_BUS_CALIBRATE");
 	conf->mic_sink_program_path = starpu_getenv("STARPU_MIC_PROGRAM_PATH");
@@ -926,6 +1025,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_mic_deviceid = 0; /* TODO */
 	conf->use_explicit_workers_scc_deviceid = 0; /* TODO */
+	conf->use_explicit_workers_mpi_deviceid = 0; /* TODO */
 
 	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
 	if (conf->single_combined_worker == -1)
@@ -963,6 +1063,14 @@ int starpu_conf_init(struct starpu_conf *conf)
 		conf->disable_asynchronous_mic_copy = 0;
 #endif
 
+#if defined(STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY)
+    conf->disable_asynchronous_mpi_ms_copy = 1;
+#else
+    conf->disable_asynchronous_mpi_ms_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY");
+    if(conf->disable_asynchronous_mpi_ms_copy == -1)
+        conf->disable_asynchronous_mpi_ms_copy = 0;
+#endif
+
 	/* 64MiB by default */
 	conf->trace_buffer_size = starpu_get_env_number_default("STARPU_TRACE_BUFFER_SIZE", 64) << 20;
 	return 0;
@@ -1007,6 +1115,7 @@ void _starpu_conf_check_environment(struct starpu_conf *conf)
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY", &conf->disable_asynchronous_cuda_copy);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY", &conf->disable_asynchronous_opencl_copy);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY", &conf->disable_asynchronous_mic_copy);
+	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY", &conf->disable_asynchronous_mpi_ms_copy);
 }
 
 struct starpu_tree* starpu_workers_get_tree(void)
@@ -1109,6 +1218,18 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 		setenv("STARPU_SINK", "STARPU_SCC", 1);
 #	endif
 
+#       ifdef STARPU_USE_MPI_MASTER_SLAVE
+        if (_starpu_mpi_common_mp_init() == -ENODEV)
+        {
+                initialized = UNINITIALIZED;
+                return -ENODEV;
+        }
+
+        /* In MPI case we look at the rank to know if we are a sink */
+        if (!_starpu_mpi_common_is_src_node())
+                setenv("STARPU_SINK", "STARPU_MPI_MS", 1);
+#       endif
+
 	/* If StarPU was configured to use MP sinks, we have to control the
 	 * kind on node we are running on : host or sink ? */
 	if (starpu_getenv("STARPU_SINK"))
@@ -1243,7 +1364,8 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	/* Depending on whether we are a MP sink or not, we must build the
 	 * topology with MP nodes or not. */
 	ret = _starpu_build_topology(&_starpu_config, is_a_sink);
-	if (ret)
+    /* a sink doesn't exit even if no worker was discovered */
+	if (ret && !is_a_sink)
 	{
 		starpu_perfmodel_free_sampling_directories();
 		STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
@@ -1255,6 +1377,11 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 		if (_starpu_scc_common_is_mp_initialized())
 			_starpu_scc_src_mp_deinit();
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+                if (_starpu_mpi_common_is_mp_initialized())
+                        _starpu_mpi_common_mp_deinit();
+#endif
+
 		initialized = UNINITIALIZED;
 		/* Let somebody else try to do it */
 		STARPU_PTHREAD_COND_SIGNAL(&init_cond);
@@ -1602,6 +1729,10 @@ void starpu_shutdown(void)
 	if (_starpu_scc_common_is_mp_initialized())
 		_starpu_scc_src_mp_deinit();
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+    if (_starpu_mpi_common_is_mp_initialized())
+        _starpu_mpi_common_mp_deinit();
+#endif 
 	_starpu_print_idle_time();
 	_STARPU_DEBUG("Shutdown finished\n");
 
@@ -1646,12 +1777,16 @@ int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 		case STARPU_SCC_WORKER:
 			return _starpu_config.topology.nsccdevices;
 
-		case STARPU_ANY_WORKER:
-			return _starpu_config.topology.ncpus+
-			       _starpu_config.topology.ncudagpus+
-			       _starpu_config.topology.nopenclgpus+
-			       _starpu_config.topology.nmicdevices+
-			       _starpu_config.topology.nsccdevices;
+                case STARPU_MPI_WORKER:
+                        return _starpu_config.topology.nmpidevices;
+
+                case STARPU_ANY_WORKER:
+                        return _starpu_config.topology.ncpus+
+                                _starpu_config.topology.ncudagpus+
+                                _starpu_config.topology.nopenclgpus+
+                                _starpu_config.topology.nmicdevices+
+                                _starpu_config.topology.nsccdevices+
+                                _starpu_config.topology.nmpidevices;
 		default:
 			return -EINVAL;
 	}
@@ -1697,6 +1832,11 @@ int starpu_asynchronous_mic_copy_disabled(void)
 	return _starpu_config.conf.disable_asynchronous_mic_copy;
 }
 
+int starpu_asynchronous_mpi_ms_copy_disabled(void)
+{
+        return _starpu_config.conf.disable_asynchronous_mpi_ms_copy;
+}
+
 unsigned starpu_mic_worker_get_count(void)
 {
 	int i = 0, count = 0;
@@ -1712,6 +1852,11 @@ unsigned starpu_scc_worker_get_count(void)
 	return _starpu_config.topology.nsccdevices;
 }
 
+unsigned starpu_mpi_ms_worker_get_count(void)
+{
+        return _starpu_config.topology.nmpidevices;
+}
+
 /* When analyzing performance, it is useful to see what is the processing unit
  * that actually performed the task. This function returns the id of the
  * processing unit actually executing it, therefore it makes no sense to use it
@@ -2239,6 +2384,7 @@ char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 	if (type == STARPU_CUDA_WORKER) return "STARPU_CUDA_WORKER";
 	if (type == STARPU_OPENCL_WORKER) return "STARPU_OPENCL_WORKER";
 	if (type == STARPU_MIC_WORKER) return "STARPU_MIC_WORKER";
+        if (type == STARPU_MPI_WORKER) return "STARPU_MPI_WORKER";
 	if (type == STARPU_SCC_WORKER) return "STARPU_SCC_WORKER";
 	if (type == STARPU_ANY_WORKER) return "STARPU_ANY_WORKER";
 	return "STARPU_unknown_WORKER";

+ 29 - 1
src/core/workers.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
- * Copyright (C) 2011  INRIA
+ * Copyright (C) 2011, 2016  INRIA
  * Copyright (C) 2016  Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -47,6 +47,9 @@
 #include <drivers/scc/driver_scc_source.h>
 #endif
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+#include <drivers/mpi/driver_mpi_source.h>
+#endif
 
 #include <drivers/cpu/driver_cpu.h>
 
@@ -112,6 +115,8 @@ LIST_TYPE(_starpu_worker,
 
 	unsigned spinning_backoff ; /* number of cycles to pause when spinning  */
 
+        unsigned nb_buffers_sent; /* number of pieces of data already sent to the remote side */
+        struct starpu_task *task_sending; /* The buffers of this task are being sent */
 
 	/* indicate whether the workers shares tasks lists with other workers*/
 	/* in this case when removing him from a context it disapears instantly */
@@ -180,6 +185,10 @@ struct _starpu_worker_set
 	unsigned set_is_initialized;
 };
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+extern struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
+#endif
+
 struct _starpu_machine_topology
 {
 	/* Total number of workers. */
@@ -222,6 +231,11 @@ struct _starpu_machine_topology
 	 */
 	unsigned nhwscc;
 
+	/* Total number of MPI nodes, as detected. May be different
+	 * from the actual number of node workers.
+	 */
+	unsigned nhwmpi;
+
 	/* Actual number of CPU workers used by StarPU. */
 	unsigned ncpus;
 
@@ -234,6 +248,13 @@ struct _starpu_machine_topology
 	/* Actual number of SCC workers used by StarPU. */
 	unsigned nsccdevices;
 
+	/* Actual number of MPI workers used by StarPU. */
+	unsigned nmpidevices;
+        unsigned nhwmpidevices;
+
+	unsigned nhwmpicores[STARPU_MAXMPIDEVS]; // Each MPI node has its set of cores.
+	unsigned nmpicores[STARPU_MAXMPIDEVS];
+
 	/* Topology of MP nodes (mainly MIC and SCC) as well as necessary
 	 * objects to communicate with them. */
 	unsigned nhwmicdevices;
@@ -283,6 +304,8 @@ struct _starpu_machine_topology
 	 * are taken in ID order.
 	 */
 	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
+
+	unsigned workers_mpi_deviceid[STARPU_NMAXWORKERS];
 };
 
 struct _starpu_machine_config
@@ -309,6 +332,9 @@ struct _starpu_machine_config
 	/* Which SCC do we use? */
 	int current_scc_deviceid;
 
+	/* Which MPI do we use? */
+	int current_mpi_deviceid;
+
 	/* Memory node for cpus, if only one */
 	int cpus_nodeid;
 	/* Memory node for CUDA, if only one */
@@ -319,6 +345,8 @@ struct _starpu_machine_config
 	int mic_nodeid;
 	/* Memory node for SCC, if only one */
 	int scc_nodeid;
+	/* Memory node for MPI, if only one */
+	int mpi_nodeid;
 
 	/* Basic workers : each of this worker is running its own driver and
 	 * can be combined with other basic workers. */

+ 11 - 8
src/datawizard/coherency.c

@@ -146,7 +146,8 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
 			if (starpu_node_get_kind(i) == STARPU_CPU_RAM || 
 			    starpu_node_get_kind(i) == STARPU_SCC_RAM ||
-			    starpu_node_get_kind(i) == STARPU_SCC_SHM)
+			    starpu_node_get_kind(i) == STARPU_SCC_SHM ||
+                            starpu_node_get_kind(i) == STARPU_MPI_MS_RAM)
 				i_ram = i;
 			if (starpu_node_get_kind(i) == STARPU_DISK_RAM)			
 				i_disk = i;
@@ -259,6 +260,11 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 		case STARPU_MIC_RAM:
 			/* TODO: We don't handle direct MIC-MIC transfers yet */
 			return 0;
+                case STARPU_MPI_MS_RAM:
+                {
+                        enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
+                        return kind == STARPU_MPI_MS_RAM;
+                }
 		case STARPU_SCC_RAM:
 			return 1;
 		default:
@@ -717,14 +723,13 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
-	unsigned local_node = _starpu_memory_node_get_local_key();
         _STARPU_LOG_IN();
 
 	int cpt = 0;
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(local_node, 1);
+		_starpu_datawizard_progress(1);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -808,12 +813,11 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 	if ((wt_mask & ~(1<<memory_node)))
 		_starpu_write_through_data(handle, memory_node, wt_mask);
 
-	unsigned local_node = _starpu_memory_node_get_local_key();
 	int cpt = 0;
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(local_node, 1);
+		_starpu_datawizard_progress(1);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -831,12 +835,11 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 
 static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
 {
-	unsigned local_node = _starpu_memory_node_get_local_key();
 	int cpt = 0;
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(local_node, 1);
+		_starpu_datawizard_progress(1);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
@@ -909,7 +912,7 @@ int starpu_idle_prefetch_task_input_on_node(struct starpu_task *task, unsigned n
 	return starpu_idle_prefetch_task_input_on_node_prio(task, node, prio);
 }
 
-static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node)
+struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node)
 {
 	if (mode & (STARPU_SCRATCH|STARPU_REDUX))
 	{

+ 1 - 0
src/datawizard/coherency.h

@@ -324,5 +324,6 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle);
 void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle);
 
 void _starpu_data_set_unregister_hook(starpu_data_handle_t handle, _starpu_data_handle_unregister_hook func);
+struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node);
 
 #endif // __COHERENCY__H__

+ 124 - 0
src/datawizard/copy_driver.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2013, 2016  CNRS
+ * Copyright (C) 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,6 +23,9 @@
 #include <datawizard/datastats.h>
 #include <datawizard/memory_nodes.h>
 #include <drivers/disk/driver_disk.h>
+#include <drivers/mpi/driver_mpi_sink.h>
+#include <drivers/mpi/driver_mpi_source.h>
+#include <drivers/mpi/driver_mpi_common.h>
 #include <common/fxt.h>
 #include "copy_driver.h"
 #include "memalloc.h"
@@ -420,6 +424,79 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		break;
 	/* TODO: MIC -> MIC */
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MPI_MS_RAM):
+                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
+                                !(copy_methods->ram_to_mpi_ms_async || copy_methods->any_to_any))
+                {
+                        /* this is not associated to a request so it's synchronous */
+                        STARPU_ASSERT(copy_methods->ram_to_mpi_ms || copy_methods->any_to_any);
+                        if (copy_methods->ram_to_mpi_ms)
+                                copy_methods->ram_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
+                        else
+                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+                }
+                else
+                {
+                        req->async_channel.type = STARPU_MPI_MS_RAM;
+                        if(copy_methods->ram_to_mpi_ms_async)
+                                ret = copy_methods->ram_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+                        else
+                        {
+                                STARPU_ASSERT(copy_methods->any_to_any);
+                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+                        }
+                }
+                break;
+
+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM,STARPU_CPU_RAM):
+                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
+                                !(copy_methods->mpi_ms_to_ram_async || copy_methods->any_to_any))
+                {
+                        /* this is not associated to a request so it's synchronous */
+                        STARPU_ASSERT(copy_methods->mpi_ms_to_ram || copy_methods->any_to_any);
+                        if (copy_methods->mpi_ms_to_ram)
+                                copy_methods->mpi_ms_to_ram(src_interface, src_node, dst_interface, dst_node);
+                        else
+                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+                }
+                else
+                {
+                        req->async_channel.type = STARPU_MPI_MS_RAM;
+                        if(copy_methods->mpi_ms_to_ram_async)
+                                ret = copy_methods->mpi_ms_to_ram_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+                        else
+                        {
+                                STARPU_ASSERT(copy_methods->any_to_any);
+                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+                        }
+                }
+                break;
+
+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM,STARPU_MPI_MS_RAM):
+                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
+                                !(copy_methods->mpi_ms_to_mpi_ms_async || copy_methods->any_to_any))
+                {
+                        /* this is not associated to a request so it's synchronous */
+                        STARPU_ASSERT(copy_methods->mpi_ms_to_mpi_ms || copy_methods->any_to_any);
+                        if (copy_methods->mpi_ms_to_mpi_ms)
+                                copy_methods->mpi_ms_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
+                        else
+                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+                }
+                else
+                {
+                        req->async_channel.type = STARPU_MPI_MS_RAM;
+                        if(copy_methods->mpi_ms_to_mpi_ms_async)
+                                ret = copy_methods->mpi_ms_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+                        else
+                        {
+                                STARPU_ASSERT(copy_methods->any_to_any);
+                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+                        }
+                }
+                break;
+#endif
 #ifdef STARPU_USE_SCC
 		/* SCC RAM associated to the master process is considered as
 		 * the main memory node. */
@@ -660,6 +737,43 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				(void*) (dst + dst_offset), dst_node,
 				size);
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_MPI_MS_RAM):
+                if (async_data)
+                        return _starpu_mpi_copy_ram_to_mpi_async(
+                                        (void*) (src + src_offset), src_node,
+                                        (void*) (dst + dst_offset), dst_node,
+                                        size, async_data);
+                else
+                        return _starpu_mpi_copy_ram_to_mpi_sync(
+                                        (void*) (src + src_offset), src_node,
+                                        (void*) (dst + dst_offset), dst_node,
+                                        size);
+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM, STARPU_CPU_RAM):
+                if (async_data)
+                        return _starpu_mpi_copy_mpi_to_ram_async(
+                                        (void*) (src + src_offset), src_node,
+                                        (void*) (dst + dst_offset), dst_node,
+                                        size, async_data);
+                else
+                        return _starpu_mpi_copy_mpi_to_ram_sync(
+                                        (void*) (src + src_offset), src_node,
+                                        (void*) (dst + dst_offset), dst_node,
+                                        size);
+
+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM, STARPU_MPI_MS_RAM):
+                if (async_data)
+                        return _starpu_mpi_copy_sink_to_sink_async(
+                                        (void*) (src + src_offset), src_node,
+                                        (void*) (dst + dst_offset), dst_node,
+                                        size, async_data);
+                else
+                        return _starpu_mpi_copy_sink_to_sink_sync(
+                                        (void*) (src + src_offset), src_node,
+                                        (void*) (dst + dst_offset), dst_node,
+                                        size);
+#endif
+
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_DISK_RAM):
 	{
 		return _starpu_disk_copy_src_to_disk(
@@ -736,6 +850,11 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
 		_starpu_mic_wait_request_completion(&(async_channel->event.mic_event));
 		break;
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        case STARPU_MPI_MS_RAM:
+                _starpu_mpi_common_wait_event(async_channel);
+                break;
+#endif
 	case STARPU_MAIN_RAM:
 		starpu_disk_wait_request(async_channel);
 		break;
@@ -800,6 +919,11 @@ unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *as
 		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
 		break;
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        case STARPU_MPI_MS_RAM:
+                success = _starpu_mpi_common_test_event(async_channel);
+                break;
+#endif
 	case STARPU_DISK_RAM:
 		success = starpu_disk_test_request(async_channel);
 		break;

+ 30 - 3
src/datawizard/copy_driver.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2012-2015  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2013, 2015  CNRS
+ * Copyright (C) 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,6 +24,7 @@
 #endif
 
 #include <common/config.h>
+#include <common/list.h>
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
@@ -34,6 +36,10 @@
 #include <starpu_opencl.h>
 #endif
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+#include <mpi.h>
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -54,6 +60,18 @@ struct _starpu_mic_async_event
 };
 #endif
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+LIST_TYPE(_starpu_mpi_ms_event_request,
+        MPI_Request request;
+);
+
+struct _starpu_mpi_ms_async_event
+{
+        int is_sender;
+        struct _starpu_mpi_ms_event_request_list * requests;
+};
+#endif
+
 struct _starpu_disk_async_event
 {
 	unsigned memory_node;
@@ -73,21 +91,30 @@ union _starpu_async_channel_event
 	};
 #endif
 #ifdef STARPU_USE_CUDA
-	cudaEvent_t cuda_event;
+        cudaEvent_t cuda_event;
 #endif
 #ifdef STARPU_USE_OPENCL
         cl_event opencl_event;
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        struct _starpu_mpi_ms_async_event mpi_ms_event;
+#endif
 #ifdef STARPU_USE_MIC
-	struct _starpu_mic_async_event mic_event;
+        struct _starpu_mic_async_event mic_event;
 #endif
-	struct _starpu_disk_async_event disk_event;
+        struct _starpu_disk_async_event disk_event;
 };
 
 struct _starpu_async_channel
 {
 	union _starpu_async_channel_event event;
 	enum starpu_node_kind type;
+        /* Which nodes to poll when waiting for the ACK message */
+        struct _starpu_mp_node *polling_node_sender;
+        struct _starpu_mp_node *polling_node_receiver;
+        /* Used to know whether the acknowledgment message has arrived from the sinks */
+        volatile int starpu_mp_common_finished_sender; 
+        volatile int starpu_mp_common_finished_receiver; 
 };
 
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);

+ 10 - 2
src/datawizard/data_request.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -152,6 +153,13 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	r->dst_replicate = dst_replicate;
 	r->mode = mode;
 	r->async_channel.type = STARPU_UNUSED;
+        r->async_channel.starpu_mp_common_finished_sender = 0;
+        r->async_channel.starpu_mp_common_finished_receiver = 0;
+        r->async_channel.polling_node_sender = NULL;
+        r->async_channel.polling_node_receiver = NULL;
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        r->async_channel.event.mpi_ms_event.requests = NULL;
+#endif
 	if (handling_node == -1)
 		handling_node = STARPU_MAIN_RAM;
 	r->handling_node = handling_node;
@@ -206,9 +214,9 @@ int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigne
 	int do_delete = 0;
 	int completed;
 
+#ifdef STARPU_SIMGRID
 	unsigned local_node = _starpu_memory_node_get_local_key();
 
-#ifdef STARPU_SIMGRID
 	starpu_pthread_wait_t wait;
 
 	starpu_pthread_wait_init(&wait);
@@ -244,7 +252,7 @@ int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigne
 #endif
 #endif
 
-		_starpu_datawizard_progress(local_node, may_alloc);
+		_starpu_datawizard_progress(may_alloc);
 
 #ifdef STARPU_SIMGRID
 		starpu_pthread_wait_wait(&wait);

+ 20 - 3
src/datawizard/datawizard.c

@@ -19,13 +19,14 @@
 #include <common/config.h>
 #include <datawizard/datawizard.h>
 #include <datawizard/memalloc.h>
+#include <datawizard/memory_nodes.h>
 #include <core/workers.h>
 #include <core/progress_hook.h>
 #ifdef STARPU_SIMGRID
 #include <core/simgrid.h>
 #endif
 
-int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests)
+int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests)
 {
 	int ret = 0;
 
@@ -63,7 +64,23 @@ int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsig
 	return ret;
 }
 
-void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
+int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 {
-	__starpu_datawizard_progress(memory_node, may_alloc, 1);
+        int current_worker_id = starpu_worker_get_id();
+        unsigned memnode;
+        int ret = 0;
+
+        /* Not a StarPU worker (id < 0, e.g. the application thread): only progress main RAM */
+        if (current_worker_id < 0)
+                return ___starpu_datawizard_progress(STARPU_MAIN_RAM, may_alloc, push_requests);
+        /* Otherwise progress every memory node this worker drives, OR-ing the results */
+        for (memnode = 0; memnode < STARPU_MAXNODES; memnode++)
+                if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
+                        ret |= ___starpu_datawizard_progress(memnode, may_alloc, push_requests);
+        return ret;
+}
+
+void _starpu_datawizard_progress(unsigned may_alloc)
+{
+        __starpu_datawizard_progress(may_alloc, 1);
 }

+ 3 - 2
src/datawizard/datawizard.h

@@ -33,7 +33,8 @@
 
 #include <core/dependencies/implicit_data_deps.h>
 
-int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
-void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc);
+int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
+int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests);
+void _starpu_datawizard_progress(unsigned may_alloc);
 
 #endif // __DATAWIZARD_H__

+ 11 - 0
src/datawizard/malloc.c

@@ -592,6 +592,12 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 				addr = 0;
 			break;
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+		case STARPU_MPI_MS_RAM:
+			if (_starpu_mpi_src_allocate_memory((void **)(&addr), size, dst_node))
+				addr = 0;
+			break;
+#endif
 #ifdef STARPU_USE_SCC
 		case STARPU_SCC_RAM:
 			if (_starpu_scc_allocate_memory((void **)(&addr), size, dst_node))
@@ -693,6 +699,11 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 			_starpu_mic_free_memory((void*) addr, size, dst_node);
 			break;
 #endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        case STARPU_MPI_MS_RAM:
+            _starpu_mpi_source_free_memory((void*) addr, dst_node);
+            break;
+#endif
 #ifdef STARPU_USE_SCC
 		case STARPU_SCC_RAM:
 			_starpu_scc_free_memory((void *) addr, dst_node);

+ 1 - 1
src/datawizard/memalloc.c

@@ -1367,7 +1367,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+		_starpu_datawizard_progress(0);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);

+ 5 - 0
src/datawizard/memory_nodes.c

@@ -25,6 +25,8 @@
 #include "copy_driver.h"
 #include "memalloc.h"
 
+char _starpu_worker_drives_memory[STARPU_NMAXWORKERS][STARPU_MAXNODES];
+
 struct _starpu_memory_node_descr _starpu_descr;
 starpu_pthread_key_t _starpu_memory_node_key STARPU_ATTRIBUTE_INTERNAL;
 
@@ -92,6 +94,9 @@ void _starpu_memory_node_get_name(unsigned node, char *name, int size)
 	case STARPU_MIC_RAM:
 		prefix = "MIC";
 		break;
+	case STARPU_MPI_MS_RAM:
+		prefix = "MPI_MS";
+		break;
 	case STARPU_SCC_RAM:
 		prefix = "SCC_RAM";
 		break;

+ 8 - 0
src/datawizard/memory_nodes.h

@@ -33,6 +33,8 @@
 #define _STARPU_MEMORY_NODE_TUPLE_FIRST(tuple) (tuple & 0x0F)
 #define _STARPU_MEMORY_NODE_TUPLE_SECOND(tuple) (tuple & 0xF0)
 
+extern char _starpu_worker_drives_memory[STARPU_NMAXWORKERS][STARPU_MAXNODES];
+
 struct _starpu_cond_and_mutex
 {
         starpu_pthread_cond_t *cond;
@@ -96,6 +98,12 @@ static inline void _starpu_memory_node_add_nworkers(unsigned node)
 	_starpu_descr.nworkers[node]++;
 }
 
+/* Mark memnode as driven by worker_id; same utility as _starpu_memory_node_add_nworkers */
+static inline void _starpu_worker_drives_memory_node(unsigned worker_id, unsigned memnode)
+{
+    _starpu_worker_drives_memory[worker_id][memnode] = 1;   
+}
+
 static inline unsigned _starpu_memory_node_get_nworkers(unsigned node)
 {
 	return _starpu_descr.nworkers[node];

+ 1 - 1
src/datawizard/write_back.c

@@ -50,7 +50,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 				while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 				{
 					cpt++;
-					_starpu_datawizard_progress(requesting_node, 1);
+					__starpu_datawizard_progress(1, 1);
 				}
 				if (cpt == STARPU_SPIN_MAXTRY)
 					_starpu_spin_lock(&handle->header_lock);

+ 52 - 12
src/debug/traces/starpu_fxt.c

@@ -33,6 +33,7 @@
 #define CUDA_WORKER_COLORS_NB	9
 #define OPENCL_WORKER_COLORS_NB 9
 #define MIC_WORKER_COLORS_NB	9
+#define MPI_MS_WORKER_COLORS_NB	9
 #define SCC_WORKER_COLORS_NB	9
 #define OTHER_WORKER_COLORS_NB	4
 
@@ -40,6 +41,7 @@ static char *cpus_worker_colors[CPUS_WORKER_COLORS_NB] = {"/greens9/7", "/greens
 static char *cuda_worker_colors[CUDA_WORKER_COLORS_NB] = {"/ylorrd9/9", "/ylorrd9/6", "/ylorrd9/3", "/ylorrd9/1", "/ylorrd9/8", "/ylorrd9/7", "/ylorrd9/4", "/ylorrd9/2",  "/ylorrd9/1"};
 static char *opencl_worker_colors[OPENCL_WORKER_COLORS_NB] = {"/blues9/9", "/blues9/6", "/blues9/3", "/blues9/1", "/blues9/8", "/blues9/7", "/blues9/4", "/blues9/2",  "/blues9/1"};
 static char *mic_worker_colors[MIC_WORKER_COLORS_NB] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
+static char *mpi_ms_worker_colors[MPI_MS_WORKER_COLORS_NB] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
 static char *scc_worker_colors[SCC_WORKER_COLORS_NB] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
 static char *other_worker_colors[OTHER_WORKER_COLORS_NB] = {"/greys9/9", "/greys9/8", "/greys9/7", "/greys9/6"};
 static char *worker_colors[STARPU_NMAXWORKERS];
@@ -48,6 +50,7 @@ static unsigned opencl_index = 0;
 static unsigned cuda_index = 0;
 static unsigned cpus_index = 0;
 static unsigned mic_index = 0;
+static unsigned mpi_ms_index = 0;
 static unsigned scc_index = 0;
 static unsigned other_index = 0;
 
@@ -248,6 +251,14 @@ static void set_next_mic_worker_color(int workerid)
 	if (mic_index == MIC_WORKER_COLORS_NB) mic_index = 0;
 }
 
+static void set_next_mpi_ms_worker_color(int workerid)
+{
+	if (workerid >= STARPU_NMAXWORKERS)
+		return;
+	worker_colors[workerid] = mpi_ms_worker_colors[mpi_ms_index++];
+	if (mpi_ms_index == MPI_MS_WORKER_COLORS_NB) mpi_ms_index = 0;
+}
+
 static void set_next_scc_worker_color(int workerid)
 {
 	if (workerid >= STARPU_NMAXWORKERS)
@@ -907,6 +918,14 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 			arch.devices[0].devid = devid;
 			arch.devices[0].ncores = 1;
 			break;
+		case _STARPU_FUT_MPI_KEY:
+			set_next_mpi_ms_worker_color(workerid);
+			kindstr = "mpi_ms";
+			arch.devices[0].type = STARPU_MPI_WORKER;
+			arch.devices[0].devid = devid;
+			arch.devices[0].ncores = 1;
+			break;
+			
 		case _STARPU_FUT_SCC_KEY:
 			set_next_scc_worker_color(workerid);
 			kindstr = "scc";
@@ -1522,7 +1541,7 @@ static void handle_hypervisor_end(struct fxt_ev_64 *ev, struct starpu_fxt_option
 	}
 }
 
-static void handle_worker_status(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *newstatus)
+static void handle_worker_status_on_tid(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *newstatus)
 {
 	int worker;
 	worker = find_worker_id(ev->param[1]);
@@ -1535,6 +1554,19 @@ static void handle_worker_status(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		recfmt_thread_set_state(get_event_time_stamp(ev, options), ev->param[1], newstatus, "Runtime");
 }
 
+static void handle_worker_status(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *newstatus)
+{
+	int worker;
+	worker = ev->param[1];
+	if (worker < 0)
+		return;
+
+	if (out_paje_file)
+		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], newstatus);
+	if (trace_file)
+		recfmt_worker_set_state(get_event_time_stamp(ev, options), ev->param[1], newstatus, "Runtime");
+}
+
 static double last_sleep_start[STARPU_NMAXWORKERS];
 
 static void handle_worker_scheduling_start(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -2618,22 +2650,30 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				break;
 
 			/* check the memory transfer overhead */
-			case _STARPU_FUT_START_FETCH_INPUT:
-				handle_worker_status(&ev, options, "Fi");
+			case _STARPU_FUT_START_FETCH_INPUT_ON_TID:
+				handle_worker_status_on_tid(&ev, options, "Fi");
 				break;
-			case _STARPU_FUT_START_PUSH_OUTPUT:
-				handle_worker_status(&ev, options, "Po");
+			case _STARPU_FUT_START_PUSH_OUTPUT_ON_TID:
+				handle_worker_status_on_tid(&ev, options, "Po");
 				break;
-			case _STARPU_FUT_START_PROGRESS:
-				handle_worker_status(&ev, options, "P");
+			case _STARPU_FUT_START_PROGRESS_ON_TID:
+				handle_worker_status_on_tid(&ev, options, "P");
 				break;
-			case _STARPU_FUT_START_UNPARTITION:
-				handle_worker_status(&ev, options, "U");
+			case _STARPU_FUT_START_UNPARTITION_ON_TID:
+				handle_worker_status_on_tid(&ev, options, "U");
 				break;
+			case _STARPU_FUT_END_FETCH_INPUT_ON_TID:
+			case _STARPU_FUT_END_PROGRESS_ON_TID:
+			case _STARPU_FUT_END_PUSH_OUTPUT_ON_TID:
+			case _STARPU_FUT_END_UNPARTITION_ON_TID:
+				handle_worker_status_on_tid(&ev, options, "B");
+				break;
+
+			case _STARPU_FUT_START_FETCH_INPUT:
+				handle_worker_status(&ev, options, "Fi");
+				break;
+
 			case _STARPU_FUT_END_FETCH_INPUT:
-			case _STARPU_FUT_END_PROGRESS:
-			case _STARPU_FUT_END_PUSH_OUTPUT:
-			case _STARPU_FUT_END_UNPARTITION:
 				handle_worker_status(&ev, options, "B");
 				break;
 

+ 1 - 3
src/drivers/cpu/driver_cpu.c

@@ -241,9 +241,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 #endif
 
 	_STARPU_TRACE_START_PROGRESS(memnode);
-	res = __starpu_datawizard_progress(memnode, 1, 1);
-	if (memnode != STARPU_MAIN_RAM)
-		res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
+	res = __starpu_datawizard_progress(1, 1);
 	_STARPU_TRACE_END_PROGRESS(memnode);
 
 	struct _starpu_job *j;

+ 2 - 4
src/drivers/cuda/driver_cuda.c

@@ -810,16 +810,14 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	if (!idle)
 	{
 		/* Nothing ready yet, no better thing to do than waiting */
-		__starpu_datawizard_progress(memnode, 1, 0);
-		__starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 0);
+		__starpu_datawizard_progress(1, 0);
 		return 0;
 	}
 #endif
 
 	/* Something done, make some progress */
 	res = !idle;
-	res |= __starpu_datawizard_progress(memnode, 1, 1);
-	res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
+	res |= __starpu_datawizard_progress(1, 1);
 
 	/* And pull tasks */
 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, memnode);

+ 5 - 0
src/drivers/driver_common/driver_common.c

@@ -489,6 +489,11 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 		{
 			tasks[i] = NULL;
 		}
+                /* don't push a task if we are already pushing one */
+                else if (workers[i].task_sending != NULL)
+                {
+                        tasks[i] = NULL;
+                }
 		/*else try to pop a task*/
 		else
 		{

+ 5 - 5
src/drivers/mic/driver_mic_common.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,7 +31,7 @@ void _starpu_mic_common_report_scif_error(const char *func, const char *file, co
  * care about it.
  */
 
-void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len)
+void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len, void * event)
 {
   if ((scif_send(node->mp_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
 		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
@@ -56,7 +56,7 @@ int _starpu_mic_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
  * care about it.
  */
 
-void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len)
+void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event)
 {
 	if ((scif_recv(node->mp_connection.mic_endpoint, msg, len, SCIF_RECV_BLOCK)) < 0)
 		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
@@ -65,7 +65,7 @@ void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int
 /* Handles the error so the caller (which must be generic) doesn't have to
  * care about it.
  */
-void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg, int len)
+void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg, int len, void * event)
 {
 	if ((scif_send(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
 		STARPU_MP_COMMON_REPORT_ERROR(mp_node, errno);
@@ -74,7 +74,7 @@ void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg
 /* Handles the error so the caller (which must be generic) doesn't have to
  * care about it.
  */
-void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *mp_node, void *msg, int len)
+void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *mp_node, void *msg, int len, void * event)
 {
 	if ((scif_recv(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
 		STARPU_MP_COMMON_REPORT_ERROR(mp_node, errno);

+ 5 - 5
src/drivers/mic/driver_mic_common.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -56,13 +56,13 @@ void _starpu_mic_common_report_scif_error(const char *func, const char *file, in
 
 int _starpu_mic_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
 
-void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len);
+void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len, void * event);
 
-void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len);
+void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event);
 
-void _starpu_mic_common_dt_send(const struct _starpu_mp_node *node, void *msg, int len);
+void _starpu_mic_common_dt_send(const struct _starpu_mp_node *node, void *msg, int len, void * event);
 
-void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *node, void *msg, int len);
+void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event);
 
 void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node, COIPROCESS process,
 				uint16_t local_port_number, uint16_t remote_port_number);

+ 3 - 3
src/drivers/mic/driver_mic_source.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -407,7 +407,7 @@ int _starpu_mic_copy_ram_to_mic(void *src, unsigned src_node STARPU_ATTRIBUTE_UN
 {
 	const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(dst_node);
 
-	return _starpu_src_common_copy_host_to_sink(mp_node, src, dst, size);
+	return _starpu_src_common_copy_host_to_sink_sync(mp_node, src, dst, size);
 }
 
 /* Transfert SIZE bytes from the address pointed by SRC in the SRC_NODE memory
@@ -417,7 +417,7 @@ int _starpu_mic_copy_mic_to_ram(void *src, unsigned src_node, void *dst, unsigne
 {
 	const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(src_node);
 
-	return _starpu_src_common_copy_sink_to_host(mp_node, src, dst, size);
+	return _starpu_src_common_copy_sink_to_host_sync(mp_node, src, dst, size);
 }
 
 /* Asynchronous transfers */

+ 82 - 11
src/drivers/mp_common/mp_common.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,6 +26,9 @@
 #include <drivers/scc/driver_scc_common.h>
 #include <drivers/scc/driver_scc_source.h>
 #include <drivers/scc/driver_scc_sink.h>
+#include <drivers/mpi/driver_mpi_common.h>
+#include <drivers/mpi/driver_mpi_source.h>
+#include <drivers/mpi/driver_mpi_sink.h>
 
 #include <common/list.h>
 
@@ -159,6 +162,8 @@ _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
 		node->dt_send = _starpu_mic_common_dt_send;
 		node->dt_recv = _starpu_mic_common_dt_recv;
 
+                node->dt_test = NULL; /* Not used now */
+
 		node->get_kernel_from_job = NULL;
 		node->lookup = _starpu_mic_sink_lookup;
 		node->bind_thread = _starpu_mic_sink_bind_thread;
@@ -209,6 +214,8 @@ _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
 		node->dt_send_to_device = _starpu_scc_sink_send_to_device;
 		node->dt_recv_from_device = _starpu_scc_sink_recv_from_device;
 
+                node->dt_test = NULL; /* not used now */
+
 		node->get_kernel_from_job = NULL;
 		node->lookup = _starpu_scc_sink_lookup;
 		node->bind_thread = _starpu_scc_sink_bind_thread;
@@ -219,15 +226,72 @@ _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
 	break;
 #endif /* STARPU_USE_SCC */
 
-#ifdef STARPU_USE_MPI
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
 	case STARPU_NODE_MPI_SOURCE:
-		STARPU_ABORT();
+        {
+                /*
+                   node->nb_mp_sinks = 
+                   node->devid = 
+                   */
+                node->peer_id = (_starpu_mpi_common_get_src_node() <= peer_id ? peer_id+1 : peer_id);
+                node->mp_connection.mpi_remote_nodeid = node->peer_id;
+
+                node->init = _starpu_mpi_source_init;
+                node->launch_workers = NULL;
+                node->deinit = _starpu_mpi_source_deinit;
+                /*     node->report_error = */
+
+                node->mp_recv_is_ready = _starpu_mpi_common_recv_is_ready;
+                node->mp_send = _starpu_mpi_common_mp_send;
+                node->mp_recv = _starpu_mpi_common_mp_recv;
+                node->dt_send = _starpu_mpi_common_send;
+                node->dt_recv = _starpu_mpi_common_recv;
+                node->dt_send_to_device = _starpu_mpi_common_send_to_device;
+                node->dt_recv_from_device = _starpu_mpi_common_recv_from_device;
+
+                node->get_kernel_from_job = _starpu_mpi_ms_src_get_kernel_from_job;
+                node->lookup = NULL;
+                node->bind_thread = NULL;
+                node->execute = NULL;
+                node->allocate = NULL;
+                node->free = NULL;
+        }
+        break;
+
+        case STARPU_NODE_MPI_SINK:
+        {
+                /*
+                   node->nb_mp_sinks = 
+                   node->devid = 
+                   */
+                node->mp_connection.mpi_remote_nodeid = _starpu_mpi_common_get_src_node();
+
+                node->init = _starpu_mpi_sink_init;
+                node->launch_workers = _starpu_mpi_sink_launch_workers;
+                node->deinit = _starpu_mpi_sink_deinit;
+                /*    node->report_error =  */
+
+                node->mp_recv_is_ready = _starpu_mpi_common_recv_is_ready;
+                node->mp_send = _starpu_mpi_common_mp_send;
+                node->mp_recv = _starpu_mpi_common_mp_recv;
+                node->dt_send = _starpu_mpi_common_send;
+                node->dt_recv = _starpu_mpi_common_recv;
+                node->dt_send_to_device = _starpu_mpi_common_send_to_device;
+                node->dt_recv_from_device = _starpu_mpi_common_recv_from_device;
+
+                node->dt_test = _starpu_mpi_common_test_event;
+
+                node->get_kernel_from_job = NULL;
+                node->lookup = _starpu_mpi_sink_lookup;
+                node->bind_thread = _starpu_mpi_sink_bind_thread;
+                node->execute = _starpu_sink_common_execute;
+                node->allocate = _starpu_sink_common_allocate;
+                node->free = _starpu_sink_common_free;
+
+
+        }
 		break;
-
-	case STARPU_NODE_MPI_SINK:
-		STARPU_ABORT();
-		break;
-#endif /* STARPU_USE_MPI */
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
 
 	default:
 		STARPU_ASSERT(0);
@@ -243,8 +307,12 @@ _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
 	mp_message_list_init(&node->message_queue);
 	STARPU_PTHREAD_MUTEX_INIT(&node->message_queue_mutex,NULL);
 
+        STARPU_PTHREAD_MUTEX_INIT(&node->connection_mutex, NULL);
+
+        _starpu_mp_event_list_init(&node->event_list);
+
 	/* If the node is a sink then we must initialize some field */
-	if(node->kind == STARPU_NODE_MIC_SINK || node->kind == STARPU_NODE_SCC_SINK)
+	if(node->kind == STARPU_NODE_MIC_SINK || node->kind == STARPU_NODE_SCC_SINK || node->kind == STARPU_NODE_MPI_SINK)
 	{
 		int i;
 		node->is_running = 1;
@@ -258,7 +326,6 @@ _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
 		}
 		mp_barrier_list_init(&node->barrier_list);
 		STARPU_PTHREAD_MUTEX_INIT(&node->barrier_mutex,NULL);
-
 		STARPU_PTHREAD_BARRIER_INIT(&node->init_completed_barrier, NULL, node->nb_cores+1);
 
 		node->launch_workers(node);
@@ -276,7 +343,7 @@ void _starpu_mp_common_node_destroy(struct _starpu_mp_node *node)
 	STARPU_PTHREAD_MUTEX_DESTROY(&node->message_queue_mutex);
 
 	/* If the node is a sink then we must destroy some field */
-	if(node->kind == STARPU_NODE_MIC_SINK || node->kind == STARPU_NODE_SCC_SINK)
+	if(node->kind == STARPU_NODE_MIC_SINK || node->kind == STARPU_NODE_SCC_SINK || node->kind == STARPU_NODE_MPI_SINK)
 	{
 		int i;
 		for(i=0; i<node->nb_cores; i++)
@@ -303,6 +370,8 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 {
 	STARPU_ASSERT_MSG(arg_size <= BUFFER_SIZE, "Too much data (%d) for the static MIC buffer (%d), increase BUFFER_SIZE perhaps?", arg_size, BUFFER_SIZE);
 
+        //printf("SEND CMD : %d - arg_size %d by %lu \n", command, arg_size, pthread_self());
+
 	/* MIC and MPI sizes are given through a int */
 	int command_size = sizeof(enum _starpu_mp_command);
 	int arg_size_size = sizeof(int);
@@ -337,6 +406,8 @@ enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_n
 	command = *((enum _starpu_mp_command *) node->buffer);
 	*arg_size = *((int *) ((uintptr_t)node->buffer + command_size));
 
+        //printf("RECV command : %d - arg_size %d by %lu \n", command, *arg_size, pthread_self());
+
 	/* If there is no argument (ie. arg_size == 0),
 	 * let's return the command right now */
 	if (!(*arg_size))

+ 88 - 53
src/drivers/mp_common/mp_common.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,6 +26,7 @@
 #include <common/barrier.h>
 #include <common/thread.h>
 #include <datawizard/interfaces/data_interface.h>
+#include <datawizard/copy_driver.h>
 
 #ifdef STARPU_USE_MP
 
@@ -52,10 +53,21 @@ enum _starpu_mp_command
 	STARPU_MP_COMMAND_ANSWER_ALLOCATE,
 	STARPU_MP_COMMAND_ERROR_ALLOCATE,
 	STARPU_MP_COMMAND_FREE,
+        /* Synchronous send */
 	STARPU_MP_COMMAND_RECV_FROM_HOST,
 	STARPU_MP_COMMAND_SEND_TO_HOST,
 	STARPU_MP_COMMAND_RECV_FROM_SINK,
 	STARPU_MP_COMMAND_SEND_TO_SINK,
+        /* Asynchronous send */
+        STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC,
+        STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC_COMPLETED,
+	STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC,
+	STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC_COMPLETED,
+	STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC,
+	STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC_COMPLETED,
+	STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC,
+	STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC_COMPLETED,
+
 	STARPU_MP_COMMAND_TRANSFER_COMPLETE,
 	STARPU_MP_COMMAND_SINK_NBCORES,
 	STARPU_MP_COMMAND_ANSWER_SINK_NBCORES,
@@ -88,13 +100,16 @@ union _starpu_mp_connection
 #ifdef STARPU_USE_SCC
 	int scc_nodeid;
 #endif
-	int mpi_nodeid;
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+	int mpi_remote_nodeid;
+#endif
 };
 
 struct _starpu_mp_transfer_command
 {
 	size_t size;
 	void *addr;
+        void *event; /* opaque handle echoed back to the requester in the *_ASYNC_COMPLETED answer; NULL for synchronous transfers */
 };
 
 struct _starpu_mp_transfer_command_to_device
@@ -102,6 +117,7 @@ struct _starpu_mp_transfer_command_to_device
 	int devid;
 	size_t size;
 	void *addr;
+        void *event;
 };
 
 LIST_TYPE(mp_barrier,
@@ -129,6 +145,12 @@ struct mp_task
  	struct mp_barrier* mp_barrier;
 };
 
+LIST_TYPE(_starpu_mp_event, /* pending asynchronous transfer, queued on the event_list of a _starpu_mp_node */
+                struct _starpu_async_channel event; /* local channel handed to dt_send/dt_recv and polled with dt_test */
+                void * remote_event; /* value of the requester's cmd->event, sent back untouched with answer_cmd */
+                enum _starpu_mp_command answer_cmd; /* *_ASYNC_COMPLETED command to send once the transfer has finished */
+);
+
 
 /* Message-passing working node, whether source
  * or sink */
@@ -167,61 +189,74 @@ struct _starpu_mp_node
 	 * sink it controls */
 	union _starpu_mp_connection mp_connection;
 
-	/* Only MIC use this for now !!
-	 * Connection used for data transfers between the host and his sink. */
-	union _starpu_mp_connection host_sink_dt_connection;
+        /* Only MIC use this for now !!
+         * Connection used for data transfers between the host and his sink. */
+        union _starpu_mp_connection host_sink_dt_connection;
 
-	/* Only MIC use this for now !!
-	 * Only sink use this for now !!
-	 * Connection used for data transfer between devices.
-	 * A sink opens a connection with each other sink,
-	 * thus each sink can directly send data to each other.
-	 * For sink :
-	 *  - sink_sink_dt_connections[i] is the connection to the sink number i.
-	 *  - sink_sink_dt_connections[j] is not initialized for the sink number j. */
-	union _starpu_mp_connection *sink_sink_dt_connections;
-
-	/* */
-	starpu_pthread_barrier_t init_completed_barrier; 
-	
-	/* table to store pointer of the thread workers*/
-	void* thread_table;
+        /* Mutex preventing communications from being interleaved when using one thread per node,
+         * for instance when a thread transfers a piece of data while another one wants to use
+         * a sink_to_sink communication */
+        starpu_pthread_mutex_t connection_mutex;
+
+        /* Only used by MIC for now !!
+         * Only used by sinks for now !!
+         * Connections used for data transfers between devices.
+         * A sink opens a connection with every other sink,
+         * so that sinks can send data directly to each other.
+         * On the sink number j :
+         *  - sink_sink_dt_connections[i] is the connection to the sink number i.
+         *  - sink_sink_dt_connections[j] is not initialized. */
+        union _starpu_mp_connection *sink_sink_dt_connections;
+
+        /* Events of the asynchronous requests that are still pending; the sink loop
+         * tests them with dt_test and sends the event's answer_cmd once one has finished
+         */
+        struct _starpu_mp_event_list event_list;
+
+        /* Barrier waited on by _starpu_sink_common_recv_workers once the workers have been received */
+        starpu_pthread_barrier_t init_completed_barrier; 
+
+        /* table storing the pointers to the worker threads */
+        void* thread_table;
 
         /*list where threads add messages to send to the source node */
         struct mp_message_list message_queue;
-	starpu_pthread_mutex_t message_queue_mutex;
-
-	/*list of barrier for combined worker*/
-	struct mp_barrier_list barrier_list;
-	starpu_pthread_mutex_t barrier_mutex;
-
-	/*table where worker comme pick task*/
-	struct mp_task ** run_table;
-	sem_t * sem_run_table;
-
-	/* Node general functions */
-	void (*init)(struct _starpu_mp_node *node);
-	void (*launch_workers)(struct _starpu_mp_node *node);
-	void (*deinit)(struct _starpu_mp_node *node);
-	void (*report_error)(const char *, const char *, const int, const int);
-
-	/* Message passing */
-	int (*mp_recv_is_ready)(const struct _starpu_mp_node *);
-	void (*mp_send)(const struct _starpu_mp_node *, void *, int);
-	void (*mp_recv)(const struct _starpu_mp_node *, void *, int);
-
-	/* Data transfers */
-	void (*dt_send)(const struct _starpu_mp_node *, void *, int);
-	void (*dt_recv)(const struct _starpu_mp_node *, void *, int);
-	void (*dt_send_to_device)(const struct _starpu_mp_node *, int, void *, int);
-	void (*dt_recv_from_device)(const struct _starpu_mp_node *, int, void *, int);
-
-	void (*(*get_kernel_from_job)(const struct _starpu_mp_node *,struct _starpu_job *))(void);
-	void (*(*lookup)(const struct _starpu_mp_node *, char* ))(void);
-	void (*bind_thread)(const struct _starpu_mp_node *, int,int *,int);
-	void (*execute)(struct _starpu_mp_node *, void *, int);
-	void (*allocate)(const struct _starpu_mp_node *, void *, int);
-	void (*free)(const struct _starpu_mp_node *, void *, int);
+        starpu_pthread_mutex_t message_queue_mutex;
+
+        /* list of barriers for combined workers */
+        struct mp_barrier_list barrier_list;
+        starpu_pthread_mutex_t barrier_mutex;
+
+        /* table where the workers come to pick their tasks */
+        struct mp_task ** run_table;
+        sem_t * sem_run_table;
+
+        /* Node general functions */
+        void (*init)            (struct _starpu_mp_node *node);
+        void (*launch_workers)  (struct _starpu_mp_node *node);
+        void (*deinit)          (struct _starpu_mp_node *node);
+        void (*report_error)    (const char *, const char *, const int, const int);
+
+        /* Message passing */
+        int (*mp_recv_is_ready) (const struct _starpu_mp_node *);
+        void (*mp_send)         (const struct _starpu_mp_node *, void *, int);
+        void (*mp_recv)         (const struct _starpu_mp_node *, void *, int);
+
+        /* Data transfers; the last argument is the async channel tracking the transfer, or NULL for a synchronous one */
+        void (*dt_send)             (const struct _starpu_mp_node *, void *, int, void *);
+        void (*dt_recv)             (const struct _starpu_mp_node *, void *, int, void *);
+        void (*dt_send_to_device)   (const struct _starpu_mp_node *, int, void *, int, void *);
+        void (*dt_recv_from_device) (const struct _starpu_mp_node *, int, void *, int, void *);
+
+        /* Test an asynchronous transfer: non-zero once the transfer tracked by the channel has finished */
+        int (*dt_test) (struct _starpu_async_channel *);
+
+        void (*(*get_kernel_from_job)   (const struct _starpu_mp_node *,struct _starpu_job *))(void);
+        void (*(*lookup)                (const struct _starpu_mp_node *, char* ))(void);
+        void (*bind_thread)             (const struct _starpu_mp_node *, int,int *,int);
+        void (*execute)                 (struct _starpu_mp_node *, void *, int);
+        void (*allocate)                (const struct _starpu_mp_node *, void *, int);
+        void (*free)                    (const struct _starpu_mp_node *, void *, int);
 };
 
 struct _starpu_mp_node * _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind, int peer_devid) STARPU_ATTRIBUTE_MALLOC;

+ 178 - 18
src/drivers/mp_common/sink_common.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,7 @@
 #include <common/config.h>
 #include <common/utils.h>
 #include <drivers/mp_common/mp_common.h>
+#include <drivers/mpi/driver_mpi_common.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <common/barrier.h>
 #include <core/workers.h>
@@ -29,7 +30,6 @@
 
 #include "sink_common.h"
 
-
 /* Return the sink kind of the running process, based on the value of the
  * STARPU_SINK environment variable.
  * If there is no valid value retrieved, return STARPU_INVALID_KIND
@@ -45,7 +45,7 @@ static enum _starpu_mp_node_kind _starpu_sink_common_get_kind(void)
 		return STARPU_NODE_MIC_SINK;
 	else if (!strcmp(node_kind, "STARPU_SCC"))
 		return STARPU_NODE_SCC_SINK;
-	else if (!strcmp(node_kind, "STARPU_MPI"))
+	else if (!strcmp(node_kind, "STARPU_MPI_MS"))
 		return STARPU_NODE_MPI_SINK;
 	else
 		return STARPU_NODE_INVALID_KIND;
@@ -108,46 +108,168 @@ void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRI
 	free(*(void **)(arg));
 }
 
-static void _starpu_sink_common_copy_from_host(const struct _starpu_mp_node *mp_node,
+static void _starpu_sink_common_copy_from_host_sync(const struct _starpu_mp_node *mp_node,
+					       void *arg, int arg_size)
+{
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
+        /* Handles STARPU_MP_COMMAND_RECV_FROM_HOST: arg is the transfer command (address and size) */
+        struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
+
+        mp_node->dt_recv(mp_node, cmd->addr, cmd->size, NULL); /* NULL event: synchronous receive, nothing is queued */
+}
+
+/* Handles STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC: starts receiving cmd->size bytes into cmd->addr and queues an event to acknowledge the completion later */
+static void _starpu_sink_common_copy_from_host_async(struct _starpu_mp_node *mp_node,
 					       void *arg, int arg_size)
 {
 	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
 
+        struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
+
+        /* For asynchronous transfers, we store events to test them later when they are finished */
+        struct _starpu_mp_event * sink_event = _starpu_mp_event_new();
+        /* Save the command to answer with, and the requester's event to send back with it */
+        sink_event->answer_cmd = STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC_COMPLETED;
+        sink_event->remote_event = cmd->event;
+
+        /* Mark the sender (host) side as ready because we do not want to wait for its ack */
+        struct _starpu_async_channel * async_channel = &sink_event->event;
+        async_channel->type = STARPU_UNUSED;
+        async_channel->starpu_mp_common_finished_sender = -1;
+        async_channel->starpu_mp_common_finished_receiver = 0;
+        async_channel->polling_node_receiver = NULL;
+        async_channel->polling_node_sender = NULL;
+
+        mp_node->dt_recv(mp_node, cmd->addr, cmd->size, &sink_event->event);
+        /* Push the event on the list, where the sink loop will test it with dt_test */
+        _starpu_mp_event_list_push_back(&mp_node->event_list, sink_event);
+}
+
+/* Handles STARPU_MP_COMMAND_SEND_TO_HOST: answers with the same command, then synchronously sends cmd->size bytes from cmd->addr */
+static void _starpu_sink_common_copy_to_host_sync(const struct _starpu_mp_node *mp_node,
+					     void *arg, int arg_size)
+{
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
+
 	struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
 
-	mp_node->dt_recv(mp_node, cmd->addr, cmd->size);
+        /* Save the values before sending the command so that they cannot be overwritten (presumably cmd lives in the buffer reused by the send - TODO confirm) */
+        size_t size = cmd->size;
+        void * addr = cmd->addr;
+
+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_SEND_TO_HOST, NULL, 0);
+
+        mp_node->dt_send(mp_node, addr, size, NULL);
 }
 
-static void _starpu_sink_common_copy_to_host(const struct _starpu_mp_node *mp_node,
+/* Handles STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC: starts sending cmd->size bytes from cmd->addr and queues an event to acknowledge the completion later */
+static void _starpu_sink_common_copy_to_host_async(struct _starpu_mp_node *mp_node,
 					     void *arg, int arg_size)
 {
 	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
 
 	struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
 
-	mp_node->dt_send(mp_node, cmd->addr, cmd->size);
+        /* For asynchronous transfers, we tell dt_send that we are in async mode by giving it
+         * an event; that event is then pushed on the list so that it can be tested later
+         */
+        struct _starpu_mp_event * sink_event = _starpu_mp_event_new();
+        /* Save the command to answer with, and the requester's event to send back with it */
+        sink_event->answer_cmd = STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC_COMPLETED;
+        sink_event->remote_event = cmd->event;
+
+        /* Mark the receiver (host) side as ready because we do not want to wait for its ack */
+        struct _starpu_async_channel * async_channel = &sink_event->event;
+        async_channel->type = STARPU_UNUSED;
+        async_channel->starpu_mp_common_finished_sender = 0;
+        async_channel->starpu_mp_common_finished_receiver = -1;
+        async_channel->polling_node_receiver = NULL;
+        async_channel->polling_node_sender = NULL;
+
+        mp_node->dt_send(mp_node, cmd->addr, cmd->size, &sink_event->event);
+        /* Push the event on the list, where the sink loop will test it with dt_test */
+        _starpu_mp_event_list_push_back(&mp_node->event_list, sink_event);
 }
 
-static void _starpu_sink_common_copy_from_sink(const struct _starpu_mp_node *mp_node,
+/* Handles STARPU_MP_COMMAND_RECV_FROM_SINK: synchronously receives cmd->size bytes from the device cmd->devid into cmd->addr, then sends STARPU_MP_COMMAND_TRANSFER_COMPLETE */
+static void _starpu_sink_common_copy_from_sink_sync(const struct _starpu_mp_node *mp_node,
 					       void *arg, int arg_size)
 {
 	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
 
 	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
 
-	mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size);
+        mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size, NULL);
+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_TRANSFER_COMPLETE, NULL, 0);
+}
 
-	_starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_TRANSFER_COMPLETE, NULL, 0);
+/* Handles STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC: starts receiving cmd->size bytes from the device cmd->devid into cmd->addr and queues an event to acknowledge the completion later */
+static void _starpu_sink_common_copy_from_sink_async(struct _starpu_mp_node *mp_node,
+					       void *arg, int arg_size)
+{
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
+
+	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
+
+        /* For asynchronous transfers, we store events to test them later when they are finished
+        */
+        struct _starpu_mp_event * sink_event = _starpu_mp_event_new();
+        /* Save the command to answer with, and the requester's event to send back with it */
+        sink_event->answer_cmd = STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC_COMPLETED;
+        sink_event->remote_event = cmd->event;
+
+        /* Mark the sender side as ready because we do not want to wait for its ack */
+        struct _starpu_async_channel * async_channel = &sink_event->event;
+        async_channel->type = STARPU_UNUSED;
+        async_channel->starpu_mp_common_finished_sender = -1;
+        async_channel->starpu_mp_common_finished_receiver = 0;
+        async_channel->polling_node_receiver = NULL;
+        async_channel->polling_node_sender = NULL;
+
+        mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size, &sink_event->event);
+        /* Push the event on the list, where the sink loop will test it with dt_test */
+        _starpu_mp_event_list_push_back(&mp_node->event_list, sink_event);
+}
+
+/* Handles STARPU_MP_COMMAND_SEND_TO_SINK: synchronously sends cmd->size bytes from cmd->addr to the device cmd->devid */
+static void _starpu_sink_common_copy_to_sink_sync(const struct _starpu_mp_node *mp_node,
+					     void *arg, int arg_size)
+{
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
+
+	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
+
+        mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size, NULL); /* NULL event: nothing is queued */
 }
 
-static void _starpu_sink_common_copy_to_sink(const struct _starpu_mp_node *mp_node,
+/* Handles STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC: starts sending cmd->size bytes from cmd->addr to the device cmd->devid and queues an event to acknowledge the completion later */
+static void _starpu_sink_common_copy_to_sink_async(struct _starpu_mp_node *mp_node,
 					     void *arg, int arg_size)
 {
 	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
 
 	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
 
-	mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size);
+        /* For asynchronous transfers, we tell dt_send_to_device that we are in async mode by giving
+         * it an event; that event is then pushed on the list so that it can be tested later
+         */
+        struct _starpu_mp_event * sink_event = _starpu_mp_event_new();
+        /* Save the command to answer with, and the requester's event to send back with it */
+        sink_event->answer_cmd = STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC_COMPLETED;
+        sink_event->remote_event = cmd->event;
+
+        /* Mark the receiver side as ready because we do not want to wait for its ack */
+        struct _starpu_async_channel * async_channel = &sink_event->event;
+        async_channel->type = STARPU_UNUSED;
+        async_channel->starpu_mp_common_finished_sender = 0;
+        async_channel->starpu_mp_common_finished_receiver = -1;
+        async_channel->polling_node_receiver = NULL;
+        async_channel->polling_node_sender = NULL;
+
+        mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size, &sink_event->event);
+
+        /* Push the event on the list, where the sink loop will test it with dt_test */
+        _starpu_mp_event_list_push_back(&mp_node->event_list, sink_event);
 }
 
 
@@ -178,7 +300,7 @@ static void _starpu_sink_common_recv_workers(struct _starpu_mp_node * node, void
 
 	/* Retrieve workers */
 	struct _starpu_worker * workers = &config->workers[baseworkerid];
-	node->dt_recv(node,workers,worker_size);
+	node->dt_recv(node,workers,worker_size, NULL);
 
 	/* Update workers to have coherent field */
 	for(i=0; i<nworkers; i++)
@@ -205,7 +327,7 @@ static void _starpu_sink_common_recv_workers(struct _starpu_mp_node * node, void
 
 	/* Retrieve combined workers */
 	struct _starpu_combined_worker * combined_workers = config->combined_workers;
-	node->dt_recv(node, combined_workers, combined_worker_size);
+	node->dt_recv(node, combined_workers, combined_worker_size, NULL);
 
 	node->baseworkerid = baseworkerid;
 	STARPU_PTHREAD_BARRIER_WAIT(&node->init_completed_barrier);
@@ -267,21 +389,37 @@ void _starpu_sink_common_worker(void)
 					break;
 
 				case STARPU_MP_COMMAND_RECV_FROM_HOST:
-					_starpu_sink_common_copy_from_host(node, arg, arg_size);
+					_starpu_sink_common_copy_from_host_sync(node, arg, arg_size);
 					break;
 
 				case STARPU_MP_COMMAND_SEND_TO_HOST:
-					_starpu_sink_common_copy_to_host(node, arg, arg_size);
+					_starpu_sink_common_copy_to_host_sync(node, arg, arg_size);
 					break;
 
 				case STARPU_MP_COMMAND_RECV_FROM_SINK:
-					_starpu_sink_common_copy_from_sink(node, arg, arg_size);
+					_starpu_sink_common_copy_from_sink_sync(node, arg, arg_size);
 					break;
 
 				case STARPU_MP_COMMAND_SEND_TO_SINK:
-					_starpu_sink_common_copy_to_sink(node, arg, arg_size);
+					_starpu_sink_common_copy_to_sink_sync(node, arg, arg_size);
 					break;
 
+                                case STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC:
+                                        _starpu_sink_common_copy_from_host_async(node, arg, arg_size);
+                                        break;
+
+                                case STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC:
+                                        _starpu_sink_common_copy_to_host_async(node, arg, arg_size);
+                                        break;
+
+                                case STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC:
+                                        _starpu_sink_common_copy_from_sink_async(node, arg, arg_size);
+                                        break;
+
+                                case STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC:
+                                        _starpu_sink_common_copy_to_sink_async(node, arg, arg_size);
+                                        break;
+
 				case STARPU_MP_COMMAND_SYNC_WORKERS:
 					_starpu_sink_common_recv_workers(node, arg, arg_size);
 					break;
@@ -307,6 +445,24 @@ void _starpu_sink_common_worker(void)
 		{
 			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
 		}
+
+        if(!_starpu_mp_event_list_empty(&node->event_list))
+        {
+            struct _starpu_mp_event * sink_event = _starpu_mp_event_list_pop_front(&node->event_list);
+            if (node->dt_test(&sink_event->event))
+            {
+                /* send ACK to host */
+                _starpu_mp_common_send_command(node, sink_event->answer_cmd , &sink_event->remote_event, sizeof(sink_event->remote_event));
+                _starpu_mp_event_delete(sink_event);
+            }
+            else
+            {
+                /* try later */
+                 _starpu_mp_event_list_push_back(&node->event_list, sink_event);
+            }
+            
+
+        }
 	}
 
 	STARPU_PTHREAD_KEY_DELETE(worker_key);
@@ -314,6 +470,10 @@ void _starpu_sink_common_worker(void)
 	/* Deinitialize the node and release it */
 	_starpu_mp_common_node_destroy(node);
 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+        _starpu_mpi_common_mp_deinit();
+#endif
+
 	exit(0);
 }
 

+ 532 - 130
src/drivers/mp_common/source_common.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,9 +24,25 @@
 
 
 #include <datawizard/coherency.h>
+#include <datawizard/memory_nodes.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <drivers/mp_common/mp_common.h>
 
+#if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
+struct starpu_save_thread_env /* NOTE(review): presumably the thread-local state saved per device when a single thread drives all MPI devices - confirm against its users */
+{
+        struct starpu_task * current_task;
+        struct _starpu_worker * current_worker;
+        struct _starpu_worker_set * current_worker_set;
+        unsigned * current_mem_node;
+#ifdef STARPU_OPENMP
+        struct starpu_omp_thread * current_omp_thread;
+        struct starpu_omp_task * current_omp_task;
+#endif
+};
+/* STARPU_MAXMPIDEVS entries; only defined when the MPI master-slave driver is built without the multiple-thread mode */
+struct starpu_save_thread_env save_thread_env[STARPU_MAXMPIDEVS];
+#endif
 
 /* Finalize the execution of a task by a worker*/
 static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starpu_worker *worker)
@@ -67,7 +83,7 @@ static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starp
 
 
 /* Complete the execution of the job */
-static int _starpu_src_common_process_completed_job(struct _starpu_worker_set *workerset, void * arg, int arg_size)
+static int _starpu_src_common_process_completed_job(struct _starpu_mp_node *node, struct _starpu_worker_set *workerset, void * arg, int arg_size, int stored)
 {
 	int coreid;
 
@@ -80,6 +96,10 @@ static int _starpu_src_common_process_completed_job(struct _starpu_worker_set *w
 
 	struct _starpu_worker * old_worker = _starpu_get_local_worker_key();
 
+        /* if arg is not copied we release the mutex */
+        if (!stored)
+                STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
+
 	_starpu_set_local_worker_key(worker);
 	_starpu_src_common_finalize_job (j, worker);
 	_starpu_set_local_worker_key(old_worker);
@@ -89,12 +109,17 @@ static int _starpu_src_common_process_completed_job(struct _starpu_worker_set *w
 }
 
 /* Tell the scheduler when the execution has begun */
-static void _starpu_src_common_pre_exec(void * arg, int arg_size)
+static void _starpu_src_common_pre_exec(struct _starpu_mp_node *node, void * arg, int arg_size, int stored)
 {
 	int cb_workerid, i;
 	STARPU_ASSERT(sizeof(cb_workerid) == arg_size);
 	cb_workerid = *(int *) arg;
 	struct _starpu_combined_worker *combined_worker = _starpu_get_combined_worker_struct(cb_workerid);
+
+        /* arg has been read: if it was not a stored copy we release the connection mutex, as the other async handlers do */
+        if (!stored)
+                STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
+
 	for(i=0; i < combined_worker->worker_size; i++)
 	{
 		struct _starpu_worker * worker = _starpu_get_worker_struct(combined_worker->combined_workerid[i]);
@@ -107,25 +132,43 @@ static void _starpu_src_common_pre_exec(void * arg, int arg_size)
  * return 0 if the message has not been handle (it's certainly mean that it's a synchronous message)
  * return 1 if the message has been handle
  */
-static int _starpu_src_common_handle_async(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED,
+static int _starpu_src_common_handle_async(struct _starpu_mp_node *node,
 		void * arg, int arg_size,
-		enum _starpu_mp_command answer)
+		enum _starpu_mp_command answer, int stored) /* stored: 1 when arg is a copy taken from the message queue, 0 when it comes straight from a reception */
 {
-	struct _starpu_worker_set * worker_set=NULL;
-	switch(answer)
-	{
-		case STARPU_MP_COMMAND_EXECUTION_COMPLETED:
-			worker_set = _starpu_get_worker_struct(starpu_worker_get_id())->set;
-			_starpu_src_common_process_completed_job(worker_set, arg, arg_size);
-			break;
-		case STARPU_MP_COMMAND_PRE_EXECUTION:
-			_starpu_src_common_pre_exec(arg,arg_size);
-			break;
-		default:
-			return 0;
-			break;
-	}
-	return 1;
+        struct _starpu_worker_set * worker_set = NULL;
+        switch(answer)
+        {
+                case STARPU_MP_COMMAND_EXECUTION_COMPLETED:
+                        worker_set = _starpu_get_worker_struct(starpu_worker_get_id())->set;
+                        _starpu_src_common_process_completed_job(node, worker_set, arg, arg_size, stored);
+                        break;
+                case STARPU_MP_COMMAND_PRE_EXECUTION:
+                        _starpu_src_common_pre_exec(node, arg,arg_size, stored);
+                        break;
+                case STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC_COMPLETED:
+                case STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC_COMPLETED:
+                        {
+                                struct _starpu_async_channel * event = *((struct _starpu_async_channel **) arg); /* arg holds the channel pointer sent back by the sink (its remote_event) */
+                                event->starpu_mp_common_finished_receiver--; /* the receiving side of the transfer has finished */
+                                if (!stored) /* arg was not copied: it has been consumed, release the connection */
+                                        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
+                                break;
+                        }
+                case STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC_COMPLETED:
+                case STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC_COMPLETED:
+                        {
+                                struct _starpu_async_channel * event = *((struct _starpu_async_channel **) arg); /* arg holds the channel pointer sent back by the sink (its remote_event) */
+                                event->starpu_mp_common_finished_sender--; /* the sending side of the transfer has finished */
+                                if (!stored) /* arg was not copied: it has been consumed, release the connection */
+                                        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
+                                break;
+                        }
+                default: /* not an asynchronous message: left to the caller, the mutex is not touched */
+                        return 0;
+                        break;
+        }
+        return 1;
 }
 
 /* Handle all message which have been stored in the message_queue */
@@ -137,10 +180,14 @@ static void _starpu_src_common_handle_stored_async(struct _starpu_mp_node *node)
 	{
 		/* We pop a message and handle it */
 		struct mp_message * message = mp_message_list_pop_back(&node->message_queue);
+                /* Release mutex during handle */
+                STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
 		_starpu_src_common_handle_async(node, message->buffer,
-				message->size, message->type);
+				message->size, message->type, 1);
 		free(message->buffer);
 		mp_message_delete(message);
+                /* Take it again */
+                STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
 }
@@ -167,8 +214,25 @@ int _starpu_src_common_store_message(struct _starpu_mp_node *node,
 			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
 			return 1;
 			break;
-		default:
-			return 0;
+                        /* For ASYNC commands don't store them, update event */
+                case STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC_COMPLETED:
+                case STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC_COMPLETED:
+                        {
+                                struct _starpu_async_channel * event = *((struct _starpu_async_channel **) arg);
+                                event->starpu_mp_common_finished_receiver--;
+                                return 1;
+                                break;
+                        }
+                case STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC_COMPLETED:
+                case STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC_COMPLETED:
+                        {
+                                struct _starpu_async_channel * event = *((struct _starpu_async_channel **) arg);
+                                event->starpu_mp_common_finished_sender--;
+                                return 1;
+                                break;
+                        }
+                default:
+                        return 0;
 			break;
 	}
 }
@@ -195,7 +259,7 @@ static void _starpu_src_common_recv_async(struct _starpu_mp_node * node)
 	void *arg;
 	int arg_size;
 	answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
-	if(!_starpu_src_common_handle_async(node,arg,arg_size,answer))
+	if(!_starpu_src_common_handle_async(node,arg,arg_size,answer, 0))
 	{
 		printf("incorrect commande: unknown command or sync command");
 		STARPU_ASSERT(0);
@@ -237,13 +301,15 @@ static void _starpu_src_common_recv_async(struct _starpu_mp_node * node)
 
 
 /* Send a request to the sink NODE for the number of cores on it. */
-int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
+int _starpu_src_common_sink_nbcores (struct _starpu_mp_node *node, int *buf)
 {
 
 	enum _starpu_mp_command answer;
 	void *arg;
 	int arg_size = sizeof (int);
 
+        STARPU_PTHREAD_MUTEX_LOCK(&node->connection_mutex);
+
 	_starpu_mp_common_send_command (node, STARPU_MP_COMMAND_SINK_NBCORES, NULL, 0);
 
 	answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
@@ -252,6 +318,8 @@ int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *bu
 
 	memcpy (buf, arg, arg_size);
 
+        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
+
 	return 0;
 }
 
@@ -270,6 +338,8 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 	/* strlen ignore the terminating '\0' */
 	arg_size = (strlen(func_name) + 1) * sizeof(char);
 
+        STARPU_PTHREAD_MUTEX_LOCK(&node->connection_mutex);
+
 	//_STARPU_DEBUG("Looking up %s\n", func_name);
 	_starpu_mp_common_send_command(node, STARPU_MP_COMMAND_LOOKUP, (void *) func_name,
 			arg_size);
@@ -277,9 +347,11 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 	answer = _starpu_src_common_wait_command_sync(node, (void **) &arg,
 			&arg_size);
 
+
 	if (answer == STARPU_MP_COMMAND_ERROR_LOOKUP)
 	{
 		_STARPU_DISP("Error looking up symbol %s\n", func_name);
+                STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
 		return -ESPIPE;
 	}
 
@@ -290,6 +362,8 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 
 	memcpy(func_ptr, arg, arg_size);
 
+        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
+
 	//_STARPU_DEBUG("got %p\n", *func_ptr);
 
 	return 0;
@@ -314,7 +388,6 @@ int _starpu_src_common_execute_kernel(struct _starpu_mp_node *node,
 		unsigned nb_interfaces,
 		void *cl_arg, size_t cl_arg_size)
 {
-
 	void *buffer, *arg =NULL;
 	uintptr_t buffer_ptr;
 	int buffer_size = 0, arg_size =0;
@@ -384,14 +457,22 @@ int _starpu_src_common_execute_kernel(struct _starpu_mp_node *node,
 	if (cl_arg)
 		memcpy((void*) buffer_ptr, cl_arg, cl_arg_size);
 
+        STARPU_PTHREAD_MUTEX_LOCK(&node->connection_mutex);
+
 	_starpu_mp_common_send_command(node, STARPU_MP_COMMAND_EXECUTE, buffer, buffer_size);
+
 	enum _starpu_mp_command answer = _starpu_src_common_wait_command_sync(node, &arg, &arg_size);
 
-	if (answer == STARPU_MP_COMMAND_ERROR_EXECUTE)
-		return -EINVAL;
+        if (answer == STARPU_MP_COMMAND_ERROR_EXECUTE)
+        {
+                STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
+                return -EINVAL;
+        }
 
 	STARPU_ASSERT(answer == STARPU_MP_COMMAND_EXECUTION_SUBMITTED);
 
+        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
+
 	free(buffer);
 
 	return 0;
@@ -451,85 +532,230 @@ int _starpu_src_common_allocate(struct _starpu_mp_node *mp_node,
 	void *arg;
 	int arg_size;
 
+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
+
 	_starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_ALLOCATE, &size,
 			sizeof(size));
 
 	answer = _starpu_src_common_wait_command_sync(mp_node, &arg, &arg_size);
 
-	if (answer == STARPU_MP_COMMAND_ERROR_ALLOCATE)
-		return 1;
+        if (answer == STARPU_MP_COMMAND_ERROR_ALLOCATE)
+        {
+                STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
+                return 1;
+        }
 
 	STARPU_ASSERT(answer == STARPU_MP_COMMAND_ANSWER_ALLOCATE &&
 			arg_size == sizeof(*addr));
-
+    
 	memcpy(addr, arg, arg_size);
 
+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
+
 	return 0;
 }
 
 /* Send a request to the sink linked to the MP_NODE to deallocate the memory
  * area pointed by ADDR.
  */
-void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
+void _starpu_src_common_free(struct _starpu_mp_node *mp_node,
 		void *addr)
 {
-	_starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_FREE, &addr, sizeof(addr));
+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_FREE, &addr, sizeof(addr));
+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
 }
 
-/* Send SIZE bytes pointed by SRC to DST on the sink linked to the MP_NODE.
-*/
-int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
+/* Send SIZE bytes pointed by SRC to DST on the sink linked to the MP_NODE with a
+ * synchronous mode.
+ */
+int _starpu_src_common_copy_host_to_sink_sync(struct _starpu_mp_node *mp_node,
 		void *src, void *dst, size_t size)
 {
-	struct _starpu_mp_transfer_command cmd = {size, dst};
+        struct _starpu_mp_transfer_command cmd = {size, dst, NULL};
 
-	_starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_RECV_FROM_HOST, &cmd, sizeof(cmd));
-	mp_node->dt_send(mp_node, src, size);
+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
 
-	return 0;
+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_RECV_FROM_HOST, &cmd, sizeof(cmd));
+
+        mp_node->dt_send(mp_node, src, size, NULL);
+
+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
+
+        return 0;
 }
 
-/* Receive SIZE bytes pointed by SRC on the sink linked to the MP_NODE and store them in DST.
-*/
-int _starpu_src_common_copy_sink_to_host(const struct _starpu_mp_node *mp_node,
+/* Send SIZE bytes pointed by SRC to DST on the sink linked to the MP_NODE with an
+ * asynchronous mode.
+ */
+int _starpu_src_common_copy_host_to_sink_async(struct _starpu_mp_node *mp_node,
+		void *src, void *dst, size_t size, void * event)
+{
+        struct _starpu_mp_transfer_command cmd = {size, dst, event};
+
+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
+
+        /* For asynchronous transfers, we save information
+         * to test if they are finished
+         */
+        struct _starpu_async_channel * async_channel = event;
+        async_channel->polling_node_receiver = mp_node;
+
+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC, &cmd, sizeof(cmd));
+
+        mp_node->dt_send(mp_node, src, size, event);
+
+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
+
+        return -EAGAIN;
+}
+
+/* Receive SIZE bytes pointed by SRC on the sink linked to the MP_NODE and store them in DST
+ * with a synchronous mode.
+ */
+int _starpu_src_common_copy_sink_to_host_sync(struct _starpu_mp_node *mp_node,
 		void *src, void *dst, size_t size)
 {
-	struct _starpu_mp_transfer_command cmd = {size, src};
+        enum _starpu_mp_command answer;
+        void *arg;
+        int arg_size;
+        struct _starpu_mp_transfer_command cmd = {size, src, NULL};
 
-	_starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_SEND_TO_HOST, &cmd, sizeof(cmd));
-	mp_node->dt_recv(mp_node, dst, size);
+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
 
-	return 0;
+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_SEND_TO_HOST, &cmd, sizeof(cmd));
+
+        answer = _starpu_src_common_wait_command_sync(mp_node, &arg, &arg_size);
+
+        STARPU_ASSERT(answer == STARPU_MP_COMMAND_SEND_TO_HOST);
+
+        mp_node->dt_recv(mp_node, dst, size, NULL);
+
+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
+
+        return 0;
+}
+
+/* Receive SIZE bytes pointed by SRC on the sink linked to the MP_NODE and store them in DST
+ * with an asynchronous mode.
+ */
+int _starpu_src_common_copy_sink_to_host_async(struct _starpu_mp_node *mp_node,
+		void *src, void *dst, size_t size, void * event)
+{
+        struct _starpu_mp_transfer_command cmd = {size, src, event};
+
+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
+
+        /* For asynchronous transfers, we save information
+         * to test if they are finished
+         */
+        struct _starpu_async_channel * async_channel = event;
+        async_channel->polling_node_sender = mp_node;
+
+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC, &cmd, sizeof(cmd));
+
+        mp_node->dt_recv(mp_node, dst, size, event);
+
+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
+
+        return -EAGAIN;
 }
 
 /* Tell the sink linked to SRC_NODE to send SIZE bytes of data pointed by SRC
- * to the sink linked to DST_NODE. The latter store them in DST.
+ * to the sink linked to DST_NODE. The latter store them in DST with a synchronous
+ * mode.
  */
-int _starpu_src_common_copy_sink_to_sink(const struct _starpu_mp_node *src_node,
-		const struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size)
+int _starpu_src_common_copy_sink_to_sink_sync(struct _starpu_mp_node *src_node,
+		struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size)
 {
-	enum _starpu_mp_command answer;
-	void *arg;
-	int arg_size;
+        enum _starpu_mp_command answer;
+        void *arg;
+        int arg_size;
 
-	struct _starpu_mp_transfer_command_to_device cmd = {dst_node->peer_id, size, src};
+        struct _starpu_mp_transfer_command_to_device cmd = {dst_node->peer_id, size, src, NULL};
 
-	/* Tell source to send data to dest. */
-	_starpu_mp_common_send_command(src_node, STARPU_MP_COMMAND_SEND_TO_SINK, &cmd, sizeof(cmd));
+        /* lock the node with the smaller peer_id first to prevent deadlock */
+        if (src_node->peer_id > dst_node->peer_id)
+        {
+                STARPU_PTHREAD_MUTEX_LOCK(&dst_node->connection_mutex);
+                STARPU_PTHREAD_MUTEX_LOCK(&src_node->connection_mutex);
+        }
+        else
+        {
+                STARPU_PTHREAD_MUTEX_LOCK(&src_node->connection_mutex);
+                STARPU_PTHREAD_MUTEX_LOCK(&dst_node->connection_mutex);
+        }
 
-	cmd.devid = src_node->peer_id;
-	cmd.size = size;
-	cmd.addr = dst;
+        /* Tell source to send data to dest. */
+        _starpu_mp_common_send_command(src_node, STARPU_MP_COMMAND_SEND_TO_SINK, &cmd, sizeof(cmd));
 
-	/* Tell dest to receive data from source. */
-	_starpu_mp_common_send_command(dst_node, STARPU_MP_COMMAND_RECV_FROM_SINK, &cmd, sizeof(cmd));
+        /* Release the source as fast as possible */
+        STARPU_PTHREAD_MUTEX_UNLOCK(&src_node->connection_mutex);
 
-	/* Wait for answer from dest to know wether transfer is finished. */
-	answer = _starpu_mp_common_recv_command(dst_node, &arg, &arg_size);
+        cmd.devid = src_node->peer_id;
+        cmd.size = size;
+        cmd.addr = dst;
 
-	STARPU_ASSERT(answer == STARPU_MP_COMMAND_TRANSFER_COMPLETE);
+        /* Tell dest to receive data from source. */
+        _starpu_mp_common_send_command(dst_node, STARPU_MP_COMMAND_RECV_FROM_SINK, &cmd, sizeof(cmd));
 
-	return 0;
+        /* Wait for answer from dest to know whether transfer is finished. */
+        answer = _starpu_src_common_wait_command_sync(dst_node, &arg, &arg_size);
+
+        STARPU_ASSERT(answer == STARPU_MP_COMMAND_TRANSFER_COMPLETE);
+
+        /* Release the receiver once we have received the acknowledgment */
+        STARPU_PTHREAD_MUTEX_UNLOCK(&dst_node->connection_mutex);
+
+        return 0;
+}
+
+/* Tell the sink linked to SRC_NODE to send SIZE bytes of data pointed by SRC
+ * to the sink linked to DST_NODE. The latter stores them in DST, in asynchronous
+ * mode.
+ */
+int _starpu_src_common_copy_sink_to_sink_async(struct _starpu_mp_node *src_node,
+		struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size, void * event)
+{
+        struct _starpu_mp_transfer_command_to_device cmd = {dst_node->peer_id, size, src, event};
+
+        /* lock the node with the smaller peer_id first to prevent deadlock */
+        if (src_node->peer_id > dst_node->peer_id)
+        {
+                STARPU_PTHREAD_MUTEX_LOCK(&dst_node->connection_mutex);
+                STARPU_PTHREAD_MUTEX_LOCK(&src_node->connection_mutex);
+        }
+        else
+        {
+                STARPU_PTHREAD_MUTEX_LOCK(&src_node->connection_mutex);
+                STARPU_PTHREAD_MUTEX_LOCK(&dst_node->connection_mutex);
+        }
+
+        /* For asynchronous transfers, we save information
+         * to test if they are finished
+         */
+        struct _starpu_async_channel * async_channel = event;
+        async_channel->polling_node_sender = src_node; 
+        async_channel->polling_node_receiver = dst_node; 
+        /* Increase the number of acknowledgments awaited */
+        async_channel->starpu_mp_common_finished_receiver++;
+        async_channel->starpu_mp_common_finished_sender++;
+
+        /* Tell source to send data to dest. */
+        _starpu_mp_common_send_command(src_node, STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC, &cmd, sizeof(cmd));
+
+        STARPU_PTHREAD_MUTEX_UNLOCK(&src_node->connection_mutex);
+
+        cmd.devid = src_node->peer_id;
+        cmd.size = size;
+        cmd.addr = dst;
+
+        /* Tell dest to receive data from source. */
+        _starpu_mp_common_send_command(dst_node, STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC, &cmd, sizeof(cmd));
+
+        STARPU_PTHREAD_MUTEX_UNLOCK(&dst_node->connection_mutex);
+
+        return -EAGAIN;
 }
 
 /* 5 functions to determine the executable to run on the device (MIC, SCC,
@@ -643,6 +869,44 @@ int _starpu_src_common_locate_file(char *located_file_name,
 	return 1;
 }
 
+
+#if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
+void _starpu_src_common_init_switch_env(unsigned this)
+{
+        save_thread_env[this].current_task = starpu_task_get_current();
+        save_thread_env[this].current_worker = STARPU_PTHREAD_GETSPECIFIC(_starpu_worker_key);
+        save_thread_env[this].current_worker_set = STARPU_PTHREAD_GETSPECIFIC(_starpu_worker_set_key);
+        save_thread_env[this].current_mem_node = STARPU_PTHREAD_GETSPECIFIC(_starpu_memory_node_key);
+#ifdef STARPU_OPENMP
+        save_thread_env[this].current_omp_thread = STARPU_PTHREAD_GETSPECIFIC(omp_thread_key);
+        save_thread_env[this].current_omp_task = STARPU_PTHREAD_GETSPECIFIC(omp_task_key);
+#endif
+}
+
+static void _starpu_src_common_switch_env(unsigned old, unsigned new)
+{
+        save_thread_env[old].current_task = starpu_task_get_current();
+        save_thread_env[old].current_worker = STARPU_PTHREAD_GETSPECIFIC(_starpu_worker_key);
+        save_thread_env[old].current_worker_set = STARPU_PTHREAD_GETSPECIFIC(_starpu_worker_set_key);
+        save_thread_env[old].current_mem_node = STARPU_PTHREAD_GETSPECIFIC(_starpu_memory_node_key);
+#ifdef STARPU_OPENMP
+        save_thread_env[old].current_omp_thread = STARPU_PTHREAD_GETSPECIFIC(omp_thread_key);
+        save_thread_env[old].current_omp_task = STARPU_PTHREAD_GETSPECIFIC(omp_task_key);
+#endif
+
+
+        _starpu_set_current_task(save_thread_env[new].current_task);
+        STARPU_PTHREAD_SETSPECIFIC(_starpu_worker_key, save_thread_env[new].current_worker);
+        STARPU_PTHREAD_SETSPECIFIC(_starpu_worker_set_key, save_thread_env[new].current_worker_set);
+        STARPU_PTHREAD_SETSPECIFIC(_starpu_memory_node_key, save_thread_env[new].current_mem_node);
+#ifdef STARPU_OPENMP
+        STARPU_PTHREAD_SETSPECIFIC(omp_thread_key, save_thread_env[new].current_omp_thread);
+        STARPU_PTHREAD_SETSPECIFIC(omp_task_key, save_thread_env[new].current_omp_task); 
+#endif
+}
+#endif
+
+
 /* Send workers to the sink node
  */
 static void _starpu_src_common_send_workers(struct _starpu_mp_node * node, int baseworkerid, int nworkers)
@@ -657,96 +921,234 @@ static void _starpu_src_common_send_workers(struct _starpu_mp_node * node, int b
 	msg[3] = baseworkerid;
 	msg[4] = starpu_worker_get_count();
 
+        STARPU_PTHREAD_MUTEX_LOCK(&node->connection_mutex);
+
 	/* tell the sink node that we will send him all workers */
 	_starpu_mp_common_send_command(node, STARPU_MP_COMMAND_SYNC_WORKERS,
 			&msg, sizeof(msg));
 
 	/* Send all worker to the sink node */
-	node->dt_send(node,&config->workers[baseworkerid],worker_size);
+	node->dt_send(node,&config->workers[baseworkerid],worker_size, NULL);
 
 	/* Send all combined workers to the sink node */
-	node->dt_send(node, &config->combined_workers,combined_worker_size);
+	node->dt_send(node, &config->combined_workers,combined_worker_size, NULL);
+
+        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
 }
 
-/* Function looping on the source node */
-void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
-		unsigned baseworkerid,
-		struct _starpu_mp_node * mp_node)
+/* Callback used when a buffer is sent asynchronously to the sink */
+static void _starpu_src_common_send_data_callback(void *arg)
 {
-	unsigned memnode = worker_set->workers[0].memory_node;
-	struct starpu_task **tasks;
+        struct _starpu_worker * worker = (struct _starpu_worker *) arg;
 
-	_STARPU_MALLOC(tasks, sizeof(struct starpu_task *)*worker_set->nworkers);
+        /* increase the number of buffers sent */
+        STARPU_WMB();
+        (void)STARPU_ATOMIC_ADD(&worker->nb_buffers_sent, 1);
+}
 
-	_starpu_src_common_send_workers(mp_node, baseworkerid, worker_set->nworkers);
 
-	/*main loop*/
-	while (_starpu_machine_is_running())
-	{
-		int res = 0;
+static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set * worker_set, struct _starpu_mp_node * mp_node, struct starpu_task **tasks, unsigned memnode)
+{
+        int res = 0;
 
-		_starpu_may_pause();
+        _starpu_may_pause();
 
 #ifdef STARPU_SIMGRID
-		starpu_pthread_wait_reset(&worker_set->workers[0].wait);
+        starpu_pthread_wait_reset(&worker_set->workers[0].wait);
 #endif
 
-		_STARPU_TRACE_START_PROGRESS(memnode);
-		res |= __starpu_datawizard_progress(memnode, 1, 1);
-		res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
-		_STARPU_TRACE_END_PROGRESS(memnode);
 
-		/* Handle message which have been store */
-		_starpu_src_common_handle_stored_async(mp_node);
+        /* Test if async transfers are completed */
+        for (unsigned i = 0; i < worker_set->nworkers; i++)
+        {
+                /* All buffers have been sent: the task can now be executed */
+                if (worker_set->workers[i].task_sending != NULL && worker_set->workers[i].nb_buffers_sent == STARPU_TASK_GET_NBUFFERS(worker_set->workers[i].task_sending))
+                {
+                        int workerid = worker_set->workers[i].workerid;
+
+                        STARPU_RMB();
+                        _STARPU_TRACE_WORKER_END_FETCH_INPUT(NULL, workerid);
+
+                        unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(worker_set->workers[i].task_sending);
+                        unsigned buf;
+                        for (buf = 0; buf < nbuffers; buf++)
+                        {
+                                starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(worker_set->workers[i].task_sending, buf);
+                                struct _starpu_data_replicate *replicate = &handle->per_node[memnode];
+                                /* Release our refcnt */
+                                _starpu_spin_lock(&handle->header_lock);
+                                replicate->refcnt--;
+                                STARPU_ASSERT(replicate->refcnt >= 0);
+                                STARPU_ASSERT(handle->busy_count > 0);
+                                handle->busy_count--;
+                                if (!_starpu_data_check_not_busy(handle))
+                                        _starpu_spin_unlock(&handle->header_lock);
+                        }
+
+                        /* Execute the task */
+                        struct _starpu_job * j = _starpu_get_job_associated_to_task(worker_set->workers[i].task_sending);
+                        _starpu_set_local_worker_key(&worker_set->workers[i]);
+                        res =  _starpu_src_common_execute(j, &worker_set->workers[i], mp_node);
+                        switch (res)
+                        {
+                                case 0:
+                                        /* The task has been launched with no error */
+                                        break;
+                                case -EAGAIN:
+                                        _STARPU_DISP("ouch, this MP worker could not actually run task %p, putting it back...\n", tasks[i]);
+                                        _starpu_push_task_to_workers(worker_set->workers[i].task_sending);
+                                        STARPU_ABORT();
+                                        continue;
+                                        break;
+                                default:
+                                        STARPU_ASSERT(0);
+                        }
+
+                        /* Reset it */
+                        worker_set->workers[i].task_sending = NULL;
+                        worker_set->workers[i].nb_buffers_sent = 0;
+                }
+        }
+
+        _STARPU_TRACE_START_PROGRESS(memnode);
+        res |= __starpu_datawizard_progress(1, 1);
+        _STARPU_TRACE_END_PROGRESS(memnode);
+
+        /* Handle messages which have been stored */
+        _starpu_src_common_handle_stored_async(mp_node);
+
+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
+
+        /* poll the device for completed jobs.*/
+        while(mp_node->mp_recv_is_ready(mp_node))
+        {
+                _starpu_src_common_recv_async(mp_node);
+                /* Mutex is unlocked in _starpu_src_common_recv_async */
+                STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
+        }
+
+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
+
+        /* get task for each worker*/
+        res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, memnode);
 
-		/* poll the device for completed jobs.*/
-		while(mp_node->mp_recv_is_ready(mp_node))
-			_starpu_src_common_recv_async(mp_node);
+#ifdef STARPU_SIMGRID
+        if (!res)
+                starpu_pthread_wait_wait(&worker_set->workers[0].wait);
+#endif
 
-		/* get task for each worker*/
-		res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, memnode);
+        /* if at least one worker has popped a task */
+        if(res != 0)
+        {
+                unsigned i, buf;
+                for(i=0; i<worker_set->nworkers; i++)
+                {
+                        if(tasks[i] != NULL)
+                        {
+                                int workerid = worker_set->workers[i].workerid;
+                                _STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
+                                unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(tasks[i]);
+
+                                for (buf = 0; buf < nbuffers; buf++)
+                                {
+                                        starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(tasks[i], buf);
+                                        enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(tasks[i], buf);
+                                        struct _starpu_data_replicate *local_replicate = get_replicate(handle, mode, workerid, memnode);
+
+                                        int ret = _starpu_fetch_data_on_node(handle, memnode, local_replicate, mode, 0, 0, 1,
+                                                        _starpu_src_common_send_data_callback, &worker_set->workers[i], 0, "_starpu_src_common_worker_internal_work");
+                                        STARPU_ASSERT(!ret);
+                                }
+                                worker_set->workers[i].task_sending = tasks[i];
+                        }
+                }
+        }
+
+        /* Handle messages which have been stored */
+        _starpu_src_common_handle_stored_async(mp_node);
 
-#ifdef STARPU_SIMGRID
-		if (!res)
-			starpu_pthread_wait_wait(&worker_set->workers[0].wait);
+}
+
+
+#if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
+/* Function looping on the source node */
+void _starpu_src_common_workers_set(struct _starpu_worker_set * worker_set,
+        int ndevices, struct _starpu_mp_node ** mp_node)
+{
+        unsigned memnode[ndevices];
+        unsigned offsetmemnode[ndevices];
+        memset(offsetmemnode, 0, ndevices*sizeof(unsigned));
+
+        int device;
+        int nbworkers = 0;
+        for (device = 0; device < ndevices; device++)
+        {
+                memnode[device] = worker_set[device].workers[0].memory_node;
+                nbworkers += worker_set[device].nworkers;
+                if (device != 0)
+                        offsetmemnode[device] += offsetmemnode[device-1];
+                if (device != ndevices -1)
+                        offsetmemnode[device+1] += worker_set[device].nworkers;
+        }
+
+        struct starpu_task **tasks;
+        _STARPU_MALLOC(tasks, sizeof(struct starpu_task *)*nbworkers);
+
+        for (device = 0; device < ndevices; device++)
+        {
+                struct _starpu_worker *baseworker = &worker_set[device].workers[0];
+                struct _starpu_machine_config *config = baseworker->config;
+                unsigned baseworkerid = baseworker - config->workers;
+                _starpu_src_common_send_workers(mp_node[device], baseworkerid, worker_set[device].nworkers);
+        }
+
+        /*main loop*/
+        while (_starpu_machine_is_running())
+        {
+                for (device = 0; device < ndevices ; device++)
+                {
+                        _starpu_src_common_switch_env(((device-1)+ndevices)%ndevices, device);
+                        _starpu_src_common_worker_internal_work(&worker_set[device], mp_node[device], tasks+offsetmemnode[device], memnode[device]);
+                }
+        }
+        free(tasks);
+
+        for (device = 0; device < ndevices; device++)
+                _starpu_handle_all_pending_node_data_requests(memnode[device]);
+
+        /* In case there remains some memory that was automatically
+         * allocated by StarPU, we release it now. Note that data
+         * coherency is not maintained anymore at that point ! */
+        for (device = 0; device < ndevices; device++)
+                _starpu_free_all_automatically_allocated_buffers(memnode[device]);
+
+}
 #endif
 
-		/*if at least one worker have pop a task*/
-		if(res != 0)
-		{
-			unsigned i;
-			for(i=0; i<worker_set->nworkers; i++)
-			{
-				if(tasks[i] != NULL)
-				{
-					struct _starpu_job * j = _starpu_get_job_associated_to_task(tasks[i]);
-					_starpu_set_local_worker_key(&worker_set->workers[i]);
-					res =  _starpu_src_common_execute(j, &worker_set->workers[i], mp_node);
-					switch (res)
-					{
-						case 0:
-							/* The task task has been launched with no error */
-							break;
-						case -EAGAIN:
-							_STARPU_DISP("ouch, this MP worker could not actually run task %p, putting it back...\n", tasks[i]);
-							_starpu_push_task_to_workers(tasks[i]);
-							STARPU_ABORT();
-							continue;
-							break;
-						default:
-							STARPU_ASSERT(0);
-					}
-				}
-			}
-		}
-	}
-	free(tasks);
+/* Function looping on the source node */
+void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
+		unsigned baseworkerid,
+		struct _starpu_mp_node * mp_node)
+{
+        unsigned memnode = worker_set->workers[0].memory_node;
+        struct starpu_task **tasks;
+
+        _STARPU_MALLOC(tasks, sizeof(struct starpu_task *)*worker_set->nworkers);
+
+        _starpu_src_common_send_workers(mp_node, baseworkerid, worker_set->nworkers);
+
+        /*main loop*/
+        while (_starpu_machine_is_running())
+        {
+                _starpu_src_common_worker_internal_work(worker_set, mp_node, tasks, memnode);
+        }
+        free(tasks);
 
-	_starpu_handle_all_pending_node_data_requests(memnode);
+        _starpu_handle_all_pending_node_data_requests(memnode);
 
-	/* In case there remains some memory that was automatically
-	 * allocated by StarPU, we release it now. Note that data
-	 * coherency is not maintained anymore at that point ! */
-	_starpu_free_all_automatically_allocated_buffers(memnode);
+        /* In case there remains some memory that was automatically
+         * allocated by StarPU, we release it now. Note that data
+         * coherency is not maintained anymore at that point ! */
+        _starpu_free_all_automatically_allocated_buffers(memnode);
 
 }

+ 22 - 10
src/drivers/mp_common/source_common.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,15 +28,12 @@
 
 enum _starpu_mp_command _starpu_src_common_wait_command_sync(struct _starpu_mp_node *node, 
 							     void ** arg, int* arg_size);
-void _starpu_src_common_recv_async(struct _starpu_worker_set *worker_set, 
-				   struct _starpu_mp_node * baseworker_node);
-
 int _starpu_src_common_store_message(struct _starpu_mp_node *node, 
 		void * arg, int arg_size, enum _starpu_mp_command answer);
 
 enum _starpu_mp_command _starpu_src_common_wait_completed_execution(struct _starpu_mp_node *node, int devid, void **arg, int * arg_size);
 
-int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf);
+int _starpu_src_common_sink_nbcores (struct _starpu_mp_node *node, int *buf);
 
 int _starpu_src_common_lookup(const struct _starpu_mp_node *node,
 			      void (**func_ptr)(void), const char *func_name);
@@ -44,7 +41,7 @@ int _starpu_src_common_lookup(const struct _starpu_mp_node *node,
 int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
 				void **addr, size_t size);
 
-void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
+void _starpu_src_common_free(struct _starpu_mp_node *mp_node,
 			     void *addr);
 
 int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
@@ -57,14 +54,23 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 				      void *cl_arg, size_t cl_arg_size);
 
 
-int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
+int _starpu_src_common_copy_host_to_sink_sync(struct _starpu_mp_node *mp_node,
 					 void *src, void *dst, size_t size);
 
-int _starpu_src_common_copy_sink_to_host(const struct _starpu_mp_node *mp_node,
+int _starpu_src_common_copy_sink_to_host_sync(struct _starpu_mp_node *mp_node,
 					 void *src, void *dst, size_t size);
 
-int _starpu_src_common_copy_sink_to_sink(const struct _starpu_mp_node *src_node,
-					 const struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size);
+int _starpu_src_common_copy_sink_to_sink_sync(struct _starpu_mp_node *src_node,
+					 struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size);
+
+int _starpu_src_common_copy_host_to_sink_async(struct _starpu_mp_node *mp_node,
+					 void *src, void *dst, size_t size, void *event);
+
+int _starpu_src_common_copy_sink_to_host_async(struct _starpu_mp_node *mp_node,
+					 void *src, void *dst, size_t size, void *event);
+
+int _starpu_src_common_copy_sink_to_sink_async(struct _starpu_mp_node *src_node,
+					 struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size, void *event);
 
 int _starpu_src_common_locate_file(char *located_file_name,
 				   const char *env_file_name, const char *env_mic_path,
@@ -75,6 +81,12 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
 			       unsigned baseworkerid, 
 			       struct _starpu_mp_node * node_set);
 
+#if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
+void _starpu_src_common_init_switch_env(unsigned this);
+void _starpu_src_common_workers_set(struct _starpu_worker_set * worker_set,
+                 int ndevices,
+                 struct _starpu_mp_node ** mp_node);
+#endif
 
 #endif /* STARPU_USE_MP */
 

+ 558 - 0
src/drivers/mpi/driver_mpi_common.c

@@ -0,0 +1,558 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
+ * Copyright (C) 2016  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <mpi.h>
+#include <core/workers.h>
+#include <core/perfmodel/perfmodel.h>
+#include <drivers/mp_common/source_common.h>
+#include "driver_mpi_common.h"
+
+/* Number of messages exchanged per measurement in
+ * _starpu_mpi_common_measure_bandwidth_latency() */
+#define NITER 32
+/* Size in bytes of each message used for the bandwidth measurement */
+#define SIZE_BANDWIDTH (1024*1024)
+
+/* MPI rank taken as master when STARPU_MPI_MASTER_NODE is unset or invalid */
+#define DRIVER_MPI_MASTER_NODE_DEFAULT 0
+
+/* Set to 1 by the first call to _starpu_mpi_common_mp_init() */
+static int mpi_initialized = 0;
+/* Filled by MPI_Initialized(): non-zero when MPI was already initialized
+ * before _starpu_mpi_common_mp_init() ran, in which case
+ * _starpu_mpi_common_mp_deinit() does not call MPI_Finalize() */
+static int extern_initialized = 0;
+/* MPI rank of the master (source) node */
+static int src_node_id;
+
+/* Select the MPI rank acting as master: the value of the
+ * STARPU_MPI_MASTER_NODE environment variable when it designates an
+ * existing rank, DRIVER_MPI_MASTER_NODE_DEFAULT otherwise. */
+static void _starpu_mpi_set_src_node_id()
+{
+        int requested = starpu_get_env_number("STARPU_MPI_MASTER_NODE");
+        int chosen = DRIVER_MPI_MASTER_NODE_DEFAULT;
+
+        if (requested != -1)
+        {
+                int world_size, my_rank;
+                MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+                MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+
+                if (requested < world_size)
+                {
+                        chosen = requested;
+                }
+                else if (my_rank == DRIVER_MPI_MASTER_NODE_DEFAULT)
+                {
+                        /* A single rank reports the problem to avoid duplicated output. */
+                        _STARPU_DISP("The node you specify to be the master is "
+                                        "greater than the total number of nodes.\n"
+                                        "Taking node %d by default...\n", DRIVER_MPI_MASTER_NODE_DEFAULT);
+                }
+        }
+
+        src_node_id = chosen;
+}
+
+/* Initialize the MPI layer of the MPI Master-Slave driver.
+ *
+ * MPI_Init_thread() is called only when MPI has not been initialized yet
+ * (the result of MPI_Initialized() is kept in extern_initialized so that
+ * deinit knows whether it has to finalize MPI). The master rank is then
+ * selected with _starpu_mpi_set_src_node_id().
+ *
+ * Returns 1 on success and -ENODEV when the function was already called.
+ */
+int _starpu_mpi_common_mp_init()
+{
+        /* A second call presumably means that starpu_init() was called twice. */
+        if (mpi_initialized)
+                return -ENODEV;
+
+        mpi_initialized = 1;
+
+        if (MPI_Initialized(&extern_initialized) != MPI_SUCCESS)
+                STARPU_ABORT_MSG("Cannot check if MPI is initialized or not !");
+
+        /* Neither MPI_Init nor MPI_Init_thread has been called yet: do it here */
+        if (!extern_initialized)
+        {
+
+#if defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
+                int required = MPI_THREAD_MULTIPLE;
+#else
+                int required = MPI_THREAD_FUNNELED;
+#endif
+
+                int thread_support;
+                /* Keep the call outside of the assertion: MPI must be
+                 * initialized whatever the assertion macro expands to. */
+                int ret = MPI_Init_thread(_starpu_get_argc(), _starpu_get_argv(), required, &thread_support);
+                STARPU_ASSERT(ret == MPI_SUCCESS);
+
+                /* Thread support levels are ordered: only a level lower than
+                 * the requested one is a problem. */
+                if (thread_support < required)
+                {
+                        if (required == MPI_THREAD_MULTIPLE)
+                                _STARPU_DISP("MPI doesn't support MPI_THREAD_MULTIPLE option. MPI Master-Slave can have problems if multiple slaves are launched. \n");
+                        if (required == MPI_THREAD_FUNNELED)
+                                _STARPU_DISP("MPI doesn't support MPI_THREAD_FUNNELED option. Many errors can occur. \n");
+                }
+        }
+
+        /* Find which node is the master */
+        _starpu_mpi_set_src_node_id();
+
+        return 1;
+}
+
+/* Finalize MPI, unless it had already been initialized before
+ * _starpu_mpi_common_mp_init() ran. */
+void _starpu_mpi_common_mp_deinit()
+{
+        if (extern_initialized)
+                return;
+
+        MPI_Finalize();
+}
+
+/* Return non-zero when the calling MPI process is the master (source) node. */
+int _starpu_mpi_common_is_src_node()
+{
+        int my_rank;
+
+        MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+
+        return src_node_id == my_rank;
+}
+
+/* Return the MPI rank selected as master by _starpu_mpi_set_src_node_id(). */
+int _starpu_mpi_common_get_src_node()
+{
+        return src_node_id;
+}
+
+/* Return non-zero once _starpu_mpi_common_mp_init() has been called. */
+int _starpu_mpi_common_is_mp_initialized()
+{
+        return mpi_initialized;
+}
+
+/* Initialization shared by source and sink nodes: the node's core count is
+ * taken from the number of hardware CPUs of the machine topology. */
+void _starpu_mpi_common_mp_initialize_src_sink(struct _starpu_mp_node *node)
+{
+        node->nb_cores = _starpu_get_machine_config()->topology.nhwcpus;
+}
+
+/* Probe, without blocking, whether a message (with any tag) is waiting.
+ * Returns the flag set by MPI_Iprobe(): non-zero when a message is pending. */
+int _starpu_mpi_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
+{
+        int my_rank;
+        int pending = 0;
+
+        MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+
+        /* The source has mp_node defined and only listens to the node behind
+         * it; a sink accepts any peer since it can get sink to sink messages. */
+        int peer = (my_rank == src_node_id) ? mp_node->mp_connection.mpi_remote_nodeid : MPI_ANY_SOURCE;
+
+        int ret = MPI_Iprobe(peer, MPI_ANY_TAG, MPI_COMM_WORLD, &pending, MPI_STATUS_IGNORE);
+        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI Master/Slave cannot test if we received a message !");
+
+        return pending;
+}
+
+/* Send len bytes of msg to the remote node of node's connection.
+ *
+ * With a NULL event the message is sent synchronously (MPI_Send, SYNC_TAG).
+ * Otherwise event points to a struct _starpu_async_channel: the message is
+ * posted with MPI_Isend and ASYNC_TAG, and its request is appended to the
+ * channel's request list, which is created on first use.
+ */
+void _starpu_mpi_common_send(const struct _starpu_mp_node *node, void *msg, int len, void * event)
+{
+        int res;
+
+        if (event)
+        {
+                /* Asynchronous send */
+                struct _starpu_async_channel * channel = event;
+                channel->event.mpi_ms_event.is_sender = 1;
+
+                /* call by sink, we need to initialize some parts, for host it's done in data_request.c */
+                if (channel->type == STARPU_UNUSED)
+                        channel->event.mpi_ms_event.requests = NULL;
+
+                /* Initialize the list */
+                if (channel->event.mpi_ms_event.requests == NULL)
+                {
+                        channel->event.mpi_ms_event.requests = _starpu_mpi_ms_event_request_list_new();
+                        _starpu_mpi_ms_event_request_list_init(channel->event.mpi_ms_event.requests);
+                }
+
+                struct _starpu_mpi_ms_event_request * req = _starpu_mpi_ms_event_request_new();
+
+                res = MPI_Isend(msg, len, MPI_BYTE, node->mp_connection.mpi_remote_nodeid, ASYNC_TAG, MPI_COMM_WORLD, &req->request);
+
+                /* Both counters are raised for each posted request; they are
+                 * lowered again by the test/wait event functions. */
+                channel->starpu_mp_common_finished_receiver++;
+                channel->starpu_mp_common_finished_sender++;
+
+                _starpu_mpi_ms_event_request_list_push_back(channel->event.mpi_ms_event.requests, req);
+        }
+        else
+        {
+                /* Synchronous send */
+                res = MPI_Send(msg, len, MPI_BYTE, node->mp_connection.mpi_remote_nodeid, SYNC_TAG, MPI_COMM_WORLD);
+        }
+        STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot send a msg with a size of %d Bytes !", len);
+}
+
+/* Synchronous send: _starpu_mpi_common_send() with no event. */
+void _starpu_mpi_common_mp_send(const struct _starpu_mp_node *node, void *msg, int len)
+{
+        _starpu_mpi_common_send(node, msg, len, NULL);
+}
+
+
+/* Receive len bytes into msg from the remote node of node's connection.
+ *
+ * With a NULL event the call blocks (MPI_Recv, SYNC_TAG) and checks that
+ * exactly len bytes were received. Otherwise event points to a
+ * struct _starpu_async_channel: the reception is posted with MPI_Irecv and
+ * ASYNC_TAG, and its request is appended to the channel's request list,
+ * which is created on first use.
+ */
+void _starpu_mpi_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event)
+{
+        int res;
+
+        if (event)
+        {
+                /* Asynchronous recv */
+                struct _starpu_async_channel * channel = event;
+                channel->event.mpi_ms_event.is_sender = 0;
+
+                /* call by sink, we need to initialize some parts, for host it's done in data_request.c */
+                if (channel->type == STARPU_UNUSED)
+                        channel->event.mpi_ms_event.requests = NULL;
+
+                /* Initialize the list */
+                if (channel->event.mpi_ms_event.requests == NULL)
+                {
+                        channel->event.mpi_ms_event.requests = _starpu_mpi_ms_event_request_list_new();
+                        _starpu_mpi_ms_event_request_list_init(channel->event.mpi_ms_event.requests);
+                }
+
+                struct _starpu_mpi_ms_event_request * req = _starpu_mpi_ms_event_request_new();
+
+                res = MPI_Irecv(msg, len, MPI_BYTE, node->mp_connection.mpi_remote_nodeid, ASYNC_TAG, MPI_COMM_WORLD, &req->request);
+                STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot receive a msg with a size of %d Bytes !", len);
+
+                /* Both counters are raised for each posted request; they are
+                 * lowered again by the test/wait event functions. */
+                channel->starpu_mp_common_finished_receiver++;
+                channel->starpu_mp_common_finished_sender++;
+
+                _starpu_mpi_ms_event_request_list_push_back(channel->event.mpi_ms_event.requests, req);
+        }
+        else
+        {
+                /* Synchronous recv */
+                MPI_Status s;
+                int num_received;
+
+                res = MPI_Recv(msg, len, MPI_BYTE, node->mp_connection.mpi_remote_nodeid, SYNC_TAG, MPI_COMM_WORLD, &s);
+                /* The status is only meaningful once the reception succeeded */
+                STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot receive a msg with a size of %d Bytes !", len);
+
+                MPI_Get_count(&s, MPI_BYTE, &num_received);
+                STARPU_ASSERT_MSG(num_received == len, "MPI Master/Slave received a msg with a size of %d Bytes (expected %d Bytes) !", num_received, len);
+        }
+}
+
+/* Synchronous receive: _starpu_mpi_common_recv() with no event. */
+void _starpu_mpi_common_mp_recv(const struct _starpu_mp_node *node, void *msg, int len)
+{
+        _starpu_mpi_common_recv(node, msg, len, NULL);
+}
+
+/* Send len bytes of msg to the MPI rank dst_devid (node is not used).
+ *
+ * With a NULL event the message is sent synchronously (MPI_Send, SYNC_TAG).
+ * Otherwise event points to a struct _starpu_async_channel: the message is
+ * posted with MPI_Isend and ASYNC_TAG, and its request is appended to the
+ * channel's request list, which is created on first use.
+ */
+void _starpu_mpi_common_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len, void * event)
+{
+        int res;
+
+        if (event)
+        {
+                /* Asynchronous send */
+                struct _starpu_async_channel * channel = event;
+                channel->event.mpi_ms_event.is_sender = 1;
+
+                /* call by sink, we need to initialize some parts, for host it's done in data_request.c */
+                if (channel->type == STARPU_UNUSED)
+                        channel->event.mpi_ms_event.requests = NULL;
+
+                /* Initialize the list */
+                if (channel->event.mpi_ms_event.requests == NULL)
+                {
+                        channel->event.mpi_ms_event.requests = _starpu_mpi_ms_event_request_list_new();
+                        _starpu_mpi_ms_event_request_list_init(channel->event.mpi_ms_event.requests);
+                }
+
+                struct _starpu_mpi_ms_event_request * req = _starpu_mpi_ms_event_request_new();
+
+                res = MPI_Isend(msg, len, MPI_BYTE, dst_devid, ASYNC_TAG, MPI_COMM_WORLD, &req->request);
+
+                /* Both counters are raised for each posted request; they are
+                 * lowered again by the test/wait event functions. */
+                channel->starpu_mp_common_finished_receiver++;
+                channel->starpu_mp_common_finished_sender++;
+
+                _starpu_mpi_ms_event_request_list_push_back(channel->event.mpi_ms_event.requests, req);
+        }
+        else
+        {
+                /* Synchronous send */
+                res = MPI_Send(msg, len, MPI_BYTE, dst_devid, SYNC_TAG, MPI_COMM_WORLD);
+        }
+
+        STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot send a msg with a size of %d Bytes !", len);
+}
+
+/* Receive len bytes into msg from the MPI rank src_devid (node is not used).
+ *
+ * With a NULL event the call blocks (MPI_Recv, SYNC_TAG) and checks that
+ * exactly len bytes were received. Otherwise event points to a
+ * struct _starpu_async_channel: the reception is posted with MPI_Irecv and
+ * ASYNC_TAG, and its request is appended to the channel's request list,
+ * which is created on first use.
+ */
+void _starpu_mpi_common_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len, void * event)
+{
+        int res;
+
+        if (event)
+        {
+                /* Asynchronous recv */
+                struct _starpu_async_channel * channel = event;
+                channel->event.mpi_ms_event.is_sender = 0;
+
+                /* call by sink, we need to initialize some parts, for host it's done in data_request.c */
+                if (channel->type == STARPU_UNUSED)
+                        channel->event.mpi_ms_event.requests = NULL;
+
+                /* Initialize the list */
+                if (channel->event.mpi_ms_event.requests == NULL)
+                {
+                        channel->event.mpi_ms_event.requests = _starpu_mpi_ms_event_request_list_new();
+                        _starpu_mpi_ms_event_request_list_init(channel->event.mpi_ms_event.requests);
+                }
+
+                struct _starpu_mpi_ms_event_request * req = _starpu_mpi_ms_event_request_new();
+
+                res = MPI_Irecv(msg, len, MPI_BYTE, src_devid, ASYNC_TAG, MPI_COMM_WORLD, &req->request);
+                /* The asynchronous path is checked like the synchronous one */
+                STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot receive a msg with a size of %d Bytes !", len);
+
+                /* Both counters are raised for each posted request; they are
+                 * lowered again by the test/wait event functions. */
+                channel->starpu_mp_common_finished_receiver++;
+                channel->starpu_mp_common_finished_sender++;
+
+                _starpu_mpi_ms_event_request_list_push_back(channel->event.mpi_ms_event.requests, req);
+        }
+        else
+        {
+                /* Synchronous recv */
+                MPI_Status s;
+                int num_received;
+
+                res = MPI_Recv(msg, len, MPI_BYTE, src_devid, SYNC_TAG, MPI_COMM_WORLD, &s);
+                /* The status is only meaningful once the reception succeeded */
+                STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot receive a msg with a size of %d Bytes !", len);
+
+                MPI_Get_count(&s, MPI_BYTE, &num_received);
+                STARPU_ASSERT_MSG(num_received == len, "MPI Master/Slave received a msg with a size of %d Bytes (expected %d Bytes) !", num_received, len);
+        }
+}
+
+/* Poll the asynchronous messages of node (nothing is done when node is NULL).
+ *
+ * While the node reports a pending message, the command is received and
+ * handed to _starpu_src_common_store_message(); the whole loop runs with the
+ * node's connection mutex held. A message that cannot be stored is reported
+ * and triggers an assertion failure.
+ */
+static void _starpu_mpi_common_polling_node(struct _starpu_mp_node * node)
+{
+        if (node == NULL)
+                return;
+
+        STARPU_PTHREAD_MUTEX_LOCK(&node->connection_mutex);
+        while(node->mp_recv_is_ready(node))
+        {
+                enum _starpu_mp_command answer;
+                void *arg;
+                int arg_size;
+                answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
+                if(!_starpu_src_common_store_message(node,arg,arg_size,answer))
+                {
+                        /* Newline-terminated and sent through the display
+                         * macro, so that the diagnostic is not left in
+                         * stdout's buffer when the assertion aborts. */
+                        _STARPU_DISP("incorrect command: unknown command or sync command\n");
+                        STARPU_ASSERT(0);
+                }
+        }
+        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
+}
+
+/* Make an asynchronous transfer progress without blocking.
+ *
+ * Each pending MPI request of the channel is tested; a completed request is
+ * removed from the list and the sender or receiver counter (according to
+ * is_sender) is decremented. The list is destroyed once it is empty. Both
+ * polling nodes are then polled. Returns non-zero when both counters are zero.
+ *
+ * - In device to device communications, the first ack received by host
+ * is considered as the sender (but it cannot be, in fact, the sender)
+ */
+int _starpu_mpi_common_test_event(struct _starpu_async_channel * event)
+{
+        if (event->event.mpi_ms_event.requests != NULL && !_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests))
+        {
+                struct _starpu_mpi_ms_event_request * cur;
+                struct _starpu_mpi_ms_event_request * following;
+
+                for (cur = _starpu_mpi_ms_event_request_list_begin(event->event.mpi_ms_event.requests);
+                     cur != _starpu_mpi_ms_event_request_list_end(event->event.mpi_ms_event.requests);
+                     cur = following)
+                {
+                        /* Fetch the successor first: cur may be erased below */
+                        following = _starpu_mpi_ms_event_request_list_next(cur);
+
+                        int completed = 0;
+                        MPI_Test(&cur->request, &completed, MPI_STATUS_IGNORE);
+                        if (!completed)
+                                continue;
+
+                        _starpu_mpi_ms_event_request_list_erase(event->event.mpi_ms_event.requests, cur);
+                        _starpu_mpi_ms_event_request_delete(cur);
+
+                        if (event->event.mpi_ms_event.is_sender)
+                                event->starpu_mp_common_finished_sender--;
+                        else
+                                event->starpu_mp_common_finished_receiver--;
+                }
+
+                /* An empty list means every request has completed: destroy it */
+                if (_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests))
+                {
+                        _starpu_mpi_ms_event_request_list_delete(event->event.mpi_ms_event.requests);
+                        event->event.mpi_ms_event.requests = NULL;
+                }
+        }
+
+        _starpu_mpi_common_polling_node(event->polling_node_sender);
+        _starpu_mpi_common_polling_node(event->polling_node_receiver);
+
+        return !(event->starpu_mp_common_finished_sender || event->starpu_mp_common_finished_receiver);
+}
+
+
+/* Block until an asynchronous transfer is finished.
+ *
+ * Each pending MPI request of the channel is waited for, removed from the
+ * list and accounted on the sender or receiver counter (according to
+ * is_sender); the list is then destroyed. Finally both polling nodes are
+ * polled until the two counters are no longer positive.
+ *
+ * - In device to device communications, the first ack received by host
+ * is considered as the sender (but it cannot be, in fact, the sender)
+ */
+void _starpu_mpi_common_wait_event(struct _starpu_async_channel * event)
+{
+        if (event->event.mpi_ms_event.requests != NULL && !_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests))
+        {
+                struct _starpu_mpi_ms_event_request * cur;
+                struct _starpu_mpi_ms_event_request * following;
+
+                for (cur = _starpu_mpi_ms_event_request_list_begin(event->event.mpi_ms_event.requests);
+                     cur != _starpu_mpi_ms_event_request_list_end(event->event.mpi_ms_event.requests);
+                     cur = following)
+                {
+                        /* Fetch the successor first: cur is erased below */
+                        following = _starpu_mpi_ms_event_request_list_next(cur);
+
+                        MPI_Wait(&cur->request, MPI_STATUS_IGNORE);
+                        _starpu_mpi_ms_event_request_list_erase(event->event.mpi_ms_event.requests, cur);
+                        _starpu_mpi_ms_event_request_delete(cur);
+
+                        if (event->event.mpi_ms_event.is_sender)
+                                event->starpu_mp_common_finished_sender--;
+                        else
+                                event->starpu_mp_common_finished_receiver--;
+                }
+
+                STARPU_ASSERT_MSG(_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests), "MPI Request list is not empty after a wait_event !");
+
+                /* Every request has completed: destroy the list */
+                _starpu_mpi_ms_event_request_list_delete(event->event.mpi_ms_event.requests);
+                event->event.mpi_ms_event.requests = NULL;
+        }
+
+        /* Incoming acks from devices */
+        while (event->starpu_mp_common_finished_sender > 0 || event->starpu_mp_common_finished_receiver > 0)
+        {
+                _starpu_mpi_common_polling_node(event->polling_node_sender);
+                _starpu_mpi_common_polling_node(event->polling_node_receiver);
+        }
+}
+
+
+
+/* Synchronize all the processes of MPI_COMM_WORLD with MPI_Barrier(). */
+void _starpu_mpi_common_barrier(void)
+{
+        MPI_Barrier(MPI_COMM_WORLD);
+}
+
+/* Compute bandwidth and latency between every ordered pair of MPI nodes.
+ *
+ * For each (sender, receiver) pair, the sender times NITER sends of
+ * SIZE_BANDWIDTH bytes (bandwidth) and NITER sends of 1 byte (latency) and
+ * stores the results in its own row of bandwidth_dtod / latency_dtod.
+ * Once a sender has gone through all its receivers, it sends its two rows to
+ * the master node, so that the master has the entire set of values at the end.
+ *
+ * This is a collective operation: it contains barriers on MPI_COMM_WORLD and
+ * must be called by every process.
+ */
+void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS])
+{
+        int ret;
+        unsigned iter;
+
+        int nb_proc, id_proc;
+        MPI_Comm_rank(MPI_COMM_WORLD, &id_proc);
+        MPI_Comm_size(MPI_COMM_WORLD, &nb_proc);
+
+        /* Ranks are used as indices in the result matrices */
+        STARPU_ASSERT_MSG(nb_proc <= STARPU_MAXMPIDEVS, "MPI Master/Slave cannot measure %d nodes (at most %d are supported) !", nb_proc, STARPU_MAXMPIDEVS);
+
+        char * buf;
+        _STARPU_MALLOC(buf, SIZE_BANDWIDTH);
+        memset(buf, 0, SIZE_BANDWIDTH);
+
+        /* Signed, like the ranks they are compared with */
+        int sender, receiver;
+        for(sender = 0; sender < nb_proc; sender++)
+        {
+                for(receiver = 0; receiver < nb_proc; receiver++)
+                {
+                        /* Reached by every process for every pair, including
+                         * the skipped sender == receiver case */
+                        MPI_Barrier(MPI_COMM_WORLD);
+
+                        //Node can't be a sender and a receiver
+                        if(sender == receiver)
+                                continue;
+
+                        if(id_proc == sender)
+                        {
+                                double start, end;
+
+                                /* measure bandwidth sender to receiver */
+                                start = starpu_timing_now();
+                                for (iter = 0; iter < NITER; iter++)
+                                {
+                                        ret = MPI_Send(buf, SIZE_BANDWIDTH, MPI_BYTE, receiver, 42, MPI_COMM_WORLD);
+                                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be measured !");
+                                }
+                                end = starpu_timing_now();
+                                bandwidth_dtod[sender][receiver] = (NITER*1000000)/(end - start);
+
+                                /* measure latency sender to receiver */
+                                start = starpu_timing_now();
+                                for (iter = 0; iter < NITER; iter++)
+                                {
+                                        ret = MPI_Send(buf, 1, MPI_BYTE, receiver, 42, MPI_COMM_WORLD);
+                                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Latency of MPI Master/Slave cannot be measured !");
+                                }
+                                end = starpu_timing_now();
+                                latency_dtod[sender][receiver] = (end - start)/NITER;
+                        }
+
+                        if (id_proc == receiver)
+                        {
+                                /* measure bandwidth sender to receiver*/
+                                for (iter = 0; iter < NITER; iter++)
+                                {
+                                        ret = MPI_Recv(buf, SIZE_BANDWIDTH, MPI_BYTE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be measured !");
+                                }
+
+                                /* measure latency sender to receiver */
+                                for (iter = 0; iter < NITER; iter++)
+                                {
+                                        ret = MPI_Recv(buf, 1, MPI_BYTE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Latency of MPI Master/Slave cannot be measured !");
+                                }
+                        }
+                }
+
+                /* When a sender finished its work, it has to send its results to the master */
+
+                /* Sender doesn't need to send to itself its data */
+                if (sender == src_node_id)
+                        continue;
+
+                /* if we are the sender, we send the data */
+                if (sender == id_proc)
+                {
+                        ret = MPI_Send(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
+                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be sent to the master !");
+                        ret = MPI_Send(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
+                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Latency of MPI Master/Slave cannot be sent to the master !");
+                }
+
+                /* the master node receives the data */
+                if (src_node_id == id_proc)
+                {
+                        ret = MPI_Recv(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be received by the master !");
+                        ret = MPI_Recv(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Latency of MPI Master/Slave cannot be received by the master !");
+                }
+
+        }
+        free(buf);
+}

+ 59 - 0
src/drivers/mpi/driver_mpi_common.h

@@ -0,0 +1,59 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
+ * Copyright (C) 2016  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_MPI_COMMON_H__
+#define __DRIVER_MPI_COMMON_H__
+
+#include <drivers/mp_common/mp_common.h>
+#include <drivers/mpi/driver_mpi_source.h>
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+
+/* MPI tags used for synchronous and asynchronous messages respectively */
+#define SYNC_TAG 44
+#define ASYNC_TAG 45
+
+/* Initialize / finalize the MPI layer; MPI_Init_thread and MPI_Finalize are
+ * only called when MPI was not already initialized */
+int _starpu_mpi_common_mp_init();
+void _starpu_mpi_common_mp_deinit();
+
+/* Is the calling process the master node / rank of the master node */
+int _starpu_mpi_common_is_src_node();
+int _starpu_mpi_common_get_src_node();
+
+/* Non-zero once _starpu_mpi_common_mp_init() has been called */
+int _starpu_mpi_common_is_mp_initialized();
+/* Non-blocking probe for a pending message */
+int _starpu_mpi_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
+
+/* Initialization common to source and sink nodes */
+void _starpu_mpi_common_mp_initialize_src_sink(struct _starpu_mp_node *node);
+
+/* Exchange a message with the remote node of the connection; a NULL event
+ * selects the synchronous mode, otherwise event is the asynchronous channel */
+void _starpu_mpi_common_send(const struct _starpu_mp_node *node, void *msg, int len, void * event);
+void _starpu_mpi_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event);
+
+/* Synchronous variants of the two functions above */
+void _starpu_mpi_common_mp_send(const struct _starpu_mp_node *node, void *msg, int len);
+void _starpu_mpi_common_mp_recv(const struct _starpu_mp_node *node, void *msg, int len);
+
+/* Same as send/recv, but with an explicit MPI rank as peer */
+void _starpu_mpi_common_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len, void * event);
+void _starpu_mpi_common_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len, void * event);
+
+/* Non-blocking test / blocking wait for the completion of an asynchronous channel */
+int _starpu_mpi_common_test_event(struct _starpu_async_channel * event);
+void _starpu_mpi_common_wait_event(struct _starpu_async_channel * event);
+
+/* Barrier on MPI_COMM_WORLD */
+void _starpu_mpi_common_barrier(void);
+
+/* Measure bandwidth and latency between every pair of nodes; the master node
+ * gets the complete matrices */
+void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS]);
+
+
+#endif  /* STARPU_USE_MPI_MASTER_SLAVE */
+
+#endif	/* __DRIVER_MPI_COMMON_H__ */

+ 81 - 0
src/drivers/mpi/driver_mpi_sink.c

@@ -0,0 +1,81 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
+ * Copyright (C) 2016  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <mpi.h>
+#include <dlfcn.h>
+
+#include "driver_mpi_sink.h"
+#include "driver_mpi_source.h"
+#include "driver_mpi_common.h"
+
+/* Initialize a sink node: fill the part common to sources and sinks, then
+ * allocate one thread slot per core. */
+void _starpu_mpi_sink_init(struct _starpu_mp_node *node)
+{
+        _starpu_mpi_common_mp_initialize_src_sink(node);
+
+        /* Filled by _starpu_mpi_sink_launch_workers() */
+        _STARPU_MALLOC(node->thread_table, node->nb_cores * sizeof(starpu_pthread_t));
+        //TODO
+}
+
+/* Release the thread table allocated by _starpu_mpi_sink_init(). */
+void _starpu_mpi_sink_deinit(struct _starpu_mp_node *node)
+{
+        free(node->thread_table);
+        //TODO
+}
+
+/* Look func_name up in the running program (handle obtained with
+ * dlopen(NULL)) and return the address found by dlsym(). */
+void (*_starpu_mpi_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void)
+{
+        void *self_handle;
+        void *symbol;
+
+        self_handle = dlopen(NULL, RTLD_NOW);
+        symbol = dlsym(self_handle, func_name);
+
+        return symbol;
+}
+
+/* Start one sink thread per core of node.
+ *
+ * Thread i is created with an affinity mask restricted to CPU i, runs
+ * _starpu_sink_thread() with a freshly allocated struct arg_sink_thread
+ * (core id and node), and is recorded in node->thread_table[i].
+ */
+void _starpu_mpi_sink_launch_workers(struct _starpu_mp_node *node)
+{
+        //TODO
+        int i, ret;
+        struct arg_sink_thread * arg;
+        cpu_set_t cpuset;
+        starpu_pthread_attr_t attr;
+        starpu_pthread_t thread;
+
+        for(i=0; i < node->nb_cores; i++)
+        {
+                //init the set
+                CPU_ZERO(&cpuset);
+                CPU_SET(i,&cpuset);
+
+                ret = starpu_pthread_attr_init(&attr);
+                STARPU_ASSERT(ret == 0);
+                ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
+                STARPU_ASSERT(ret == 0);
+
+                /*prepare the argument for the thread*/
+                _STARPU_MALLOC(arg, sizeof(struct arg_sink_thread));
+                arg->coreid = i;
+                arg->node = node;
+
+                ret = starpu_pthread_create(&thread, &attr, _starpu_sink_thread, arg);
+                STARPU_ASSERT(ret == 0);
+                ((starpu_pthread_t *)node->thread_table)[i] = thread;
+
+                /* The attribute object is re-initialized at each iteration:
+                 * release it once the thread has been created. */
+                ret = starpu_pthread_attr_destroy(&attr);
+                STARPU_ASSERT(ret == 0);
+        }
+}
+
+/* Thread binding hook of the MPI sink: not implemented yet, the body is
+ * empty and none of the parameters is used. */
+void _starpu_mpi_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, int coreid, int * core_table, int nb_core)
+{
+        //TODO
+}

+ 33 - 0
src/drivers/mpi/driver_mpi_sink.h

@@ -0,0 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
+ * Copyright (C) 2016  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_MPI_SINK_H__
+#define __DRIVER_MPI_SINK_H__
+
+#include <drivers/mp_common/sink_common.h>
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+
+/* Allocate / release the per-core thread table of a sink node */
+void _starpu_mpi_sink_init(struct _starpu_mp_node *node);
+void _starpu_mpi_sink_deinit(struct _starpu_mp_node *node);
+/* Start one sink thread per core of the node */
+void _starpu_mpi_sink_launch_workers(struct _starpu_mp_node *node);
+/* Thread binding hook (currently an empty stub) */
+void _starpu_mpi_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, int coreid, int * core_table, int nb_core);
+/* Return the address of the symbol func_name in the running program */
+void (*_starpu_mpi_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void);
+
+#endif  /* STARPU_USE_MPI_MASTER_SLAVE */
+
+#endif	/* __DRIVER_MPI_SINK_H__ */

+ 343 - 0
src/drivers/mpi/driver_mpi_source.c

@@ -0,0 +1,343 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
+ * Copyright (C) 2016  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <mpi.h>
+#include <errno.h>
+
+#include <starpu.h>
+#include <drivers/mpi/driver_mpi_source.h>
+#include <drivers/mpi/driver_mpi_common.h>
+
+#include <datawizard/memory_nodes.h>
+
+#include <drivers/driver_common/driver_common.h>
+#include <drivers/mp_common/source_common.h>
+
+/* Mutex serializing accesses to the `kernels' hash table below.
+ */
+starpu_pthread_mutex_t htbl_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+
+/* Structure used by the host to store information about a kernel executable
+ * on an MPI MS device: its name, and its address on each device.
+ * A non-NULL func[devid] means a lookup has already been done for that device
+ * and the host knows the address to call; a NULL entry means the host still
+ * needs to do the lookup (see _starpu_mpi_ms_src_get_kernel()).
+ */
+struct _starpu_mpi_ms_kernel
+{
+	UT_hash_handle hh;	/* uthash bookkeeping; entries are keyed by name */
+	char *name;		/* heap-allocated copy of the kernel name */
+	starpu_mpi_ms_kernel_t func[STARPU_MAXMPIDEVS];	/* per-device address, indexed by devid */
+} *kernels;
+
+
+/* Array of structures containing all the information needed to exchange
+ * messages with the devices, indexed by device id. */
+struct _starpu_mp_node *mpi_ms_nodes[STARPU_MAXMPIDEVS];
+
+/* Initialize NODE on the source (master) side.
+ * For now this only runs the initialization shared by source and sink,
+ * _starpu_mpi_common_mp_initialize_src_sink(); source-specific set-up is
+ * still to be written.
+ */
+void _starpu_mpi_source_init(struct _starpu_mp_node *node)
+{
+        _starpu_mpi_common_mp_initialize_src_sink(node);
+        //TODO
+}
+
+/* Counterpart of _starpu_mpi_source_init().
+ * Currently a no-op: NODE is not touched and nothing is released here.
+ */
+void _starpu_mpi_source_deinit(struct _starpu_mp_node *node)
+{
+
+}
+
+/* Return the mp_node registered in mpi_ms_nodes[] for the device that owns
+ * MEMORY_NODE.
+ * The device id is obtained with _starpu_memory_node_get_devid() and must lie
+ * in [0, STARPU_MAXMPIDEVS); this is enforced with an assertion.
+ */
+struct _starpu_mp_node *_starpu_mpi_src_get_mp_node_from_memory_node(int memory_node)
+{
+        int devid = _starpu_memory_node_get_devid(memory_node);
+        /* Guard the array access below against an out-of-range device id. */
+        STARPU_ASSERT_MSG(devid >= 0 && devid < STARPU_MAXMPIDEVS, "bogus devid %d for memory node %d\n", devid, memory_node);
+
+        return mpi_ms_nodes[devid];
+}
+
+/* Allocate SIZE bytes on the device owning MEMORY_NODE and store the
+ * resulting address in *ADDR.
+ * The request is forwarded to _starpu_src_common_allocate(), whose return
+ * value is passed through unchanged.
+ */
+int _starpu_mpi_src_allocate_memory(void ** addr, size_t size, unsigned memory_node)
+{
+        return _starpu_src_common_allocate(_starpu_mpi_src_get_mp_node_from_memory_node(memory_node),
+                        addr, size);
+}
+
+/* Release the buffer ADDR previously allocated on the device owning
+ * MEMORY_NODE, by forwarding the request to _starpu_src_common_free().
+ */
+void _starpu_mpi_source_free_memory(void *addr, unsigned memory_node)
+{
+        _starpu_src_common_free(_starpu_mpi_src_get_mp_node_from_memory_node(memory_node), addr);
+}
+
+/* Transfer SIZE bytes from the host address SRC to the address DST located
+ * in the DST_NODE memory node, using the synchronous host-to-sink copy.
+ * SRC_NODE is not used. Returns what
+ * _starpu_src_common_copy_host_to_sink_sync() returns.
+ */
+int _starpu_mpi_copy_ram_to_mpi_sync(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size)
+{
+        return _starpu_src_common_copy_host_to_sink_sync(_starpu_mpi_src_get_mp_node_from_memory_node(dst_node),
+                        src, dst, size);
+}
+ 
+/* Transfer SIZE bytes from the address SRC located in the SRC_NODE memory
+ * node to the host address DST, using the synchronous sink-to-host copy.
+ * DST_NODE is not used. Returns what
+ * _starpu_src_common_copy_sink_to_host_sync() returns.
+ */
+int _starpu_mpi_copy_mpi_to_ram_sync(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size)
+{
+        return _starpu_src_common_copy_sink_to_host_sync(_starpu_mpi_src_get_mp_node_from_memory_node(src_node),
+                        src, dst, size);
+}
+
+/* Transfer SIZE bytes from SRC in the SRC_NODE memory node to DST in the
+ * DST_NODE memory node, both nodes being MPI devices, using the synchronous
+ * sink-to-sink copy. Returns what
+ * _starpu_src_common_copy_sink_to_sink_sync() returns.
+ */
+int _starpu_mpi_copy_sink_to_sink_sync(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size)
+{
+        struct _starpu_mp_node *from = _starpu_mpi_src_get_mp_node_from_memory_node(src_node);
+        struct _starpu_mp_node *to = _starpu_mpi_src_get_mp_node_from_memory_node(dst_node);
+
+        return _starpu_src_common_copy_sink_to_sink_sync(from, to, src, dst, size);
+}
+
+/* Asynchronous variant of _starpu_mpi_copy_mpi_to_ram_sync(): transfer SIZE
+ * bytes from SRC in the SRC_NODE memory node to the host address DST.
+ * EVENT is handed over as-is to _starpu_src_common_copy_sink_to_host_async(),
+ * whose return value is passed through. DST_NODE is not used.
+ */
+int _starpu_mpi_copy_mpi_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, void * event)
+{
+        return _starpu_src_common_copy_sink_to_host_async(_starpu_mpi_src_get_mp_node_from_memory_node(src_node),
+                        src, dst, size, event);
+}
+
+/* Asynchronous variant of _starpu_mpi_copy_ram_to_mpi_sync(): transfer SIZE
+ * bytes from the host address SRC to DST in the DST_NODE memory node.
+ * EVENT is handed over as-is to _starpu_src_common_copy_host_to_sink_async(),
+ * whose return value is passed through. SRC_NODE is not used.
+ */
+int _starpu_mpi_copy_ram_to_mpi_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size, void * event)
+{
+        return _starpu_src_common_copy_host_to_sink_async(_starpu_mpi_src_get_mp_node_from_memory_node(dst_node),
+                        src, dst, size, event);
+}
+
+/* Asynchronous variant of _starpu_mpi_copy_sink_to_sink_sync(): transfer SIZE
+ * bytes from SRC in the SRC_NODE memory node to DST in the DST_NODE memory
+ * node. EVENT is handed over as-is to
+ * _starpu_src_common_copy_sink_to_sink_async(), whose return value is passed
+ * through.
+ */
+int _starpu_mpi_copy_sink_to_sink_async(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size, void * event)
+{
+        struct _starpu_mp_node *from = _starpu_mpi_src_get_mp_node_from_memory_node(src_node);
+        struct _starpu_mp_node *to = _starpu_mpi_src_get_mp_node_from_memory_node(dst_node);
+
+        return _starpu_src_common_copy_sink_to_sink_async(from, to, src, dst, size, event);
+}
+
+
+/* Register the kernel called FUNC_NAME in the host-side `kernels' table and
+ * store its handle in *SYMBOL.
+ *
+ * If an entry with that name already exists it is reused; otherwise a new
+ * entry is allocated with a private copy of the name and with every
+ * per-device address cleared, so that the first call to
+ * _starpu_mpi_ms_src_get_kernel() on each device triggers a lookup.
+ *
+ * Returns 0 on success, or -ENOMEM if an allocation failed (in which case
+ * *SYMBOL is left untouched). The table is protected by htbl_mutex.
+ */
+int _starpu_mpi_ms_src_register_kernel(starpu_mpi_ms_func_symbol_t *symbol, const char *func_name)
+{
+        size_t func_name_size = strlen(func_name) + 1;
+        struct _starpu_mpi_ms_kernel *kernel;
+        unsigned int i;
+
+        STARPU_PTHREAD_MUTEX_LOCK(&htbl_mutex);
+
+        HASH_FIND_STR(kernels, func_name, kernel);
+
+        if (kernel != NULL)
+        {
+                STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+                // Function already in the table.
+                *symbol = kernel;
+                return 0;
+        }
+
+        kernel = malloc(sizeof(*kernel));
+        if (kernel == NULL)
+        {
+                STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+                return -ENOMEM;
+        }
+
+        kernel->name = malloc(func_name_size);
+        if (kernel->name == NULL)
+        {
+                STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+                free(kernel);
+                return -ENOMEM;
+        }
+
+        memcpy(kernel->name, func_name, func_name_size);
+
+        /* Clear the whole array rather than only the first "device count"
+         * slots: the count is 0 while MPI is not initialized (which would
+         * leave the slots uninitialized) and is not guaranteed to be smaller
+         * than STARPU_MAXMPIDEVS (which would overflow the array). */
+        for (i = 0; i < STARPU_MAXMPIDEVS; ++i)
+                kernel->func[i] = NULL;
+
+        /* Publish the entry only once it is fully initialized. */
+        HASH_ADD_STR(kernels, name, kernel);
+
+        STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+
+        *symbol = kernel;
+
+        return 0;
+}
+
+
+/* Return the address of the kernel designated by SYMBOL on the device driven
+ * by the calling worker.
+ *
+ * The address is cached in the kernel entry: when it is not known yet for
+ * this device, it is resolved with _starpu_src_common_lookup() first.
+ * Returns NULL when the caller is not a worker thread or when the lookup
+ * reports an error.
+ */
+starpu_mpi_ms_kernel_t _starpu_mpi_ms_src_get_kernel(starpu_mpi_ms_func_symbol_t symbol)
+{
+        /* Only meaningful from the codelet, i.e. from the thread which will
+         * handle the task. */
+        const int workerid = starpu_worker_get_id();
+        if (workerid < 0)
+                return NULL;
+
+        struct _starpu_mpi_ms_kernel *entry = symbol;
+        const int devid = starpu_worker_get_devid(workerid);
+
+        /* Fast path: the address on this device is already cached. */
+        if (entry->func[devid] != NULL)
+                return entry->func[devid];
+
+        if (_starpu_src_common_lookup(mpi_ms_nodes[devid], (void (**)(void))&entry->func[devid], entry->name))
+                return NULL;
+
+        return entry->func[devid];
+}
+
+/* Return the device-side function to execute for job J.
+ *
+ * The MPI MS implementation number j->nimpl of the codelet is used when the
+ * codelet provides one; otherwise the CPU function name of that
+ * implementation is registered and looked up on the device.
+ * NODE is unused. The function asserts that a kernel was found, so it never
+ * returns NULL when assertions are enabled.
+ */
+void(* _starpu_mpi_ms_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void)
+{
+        starpu_mpi_ms_kernel_t kernel = NULL;
+
+        starpu_mpi_ms_func_t func = _starpu_task_get_mpi_ms_nth_implementation(j->task->cl, j->nimpl);
+        if (func)
+        {
+                /* We execute the function contained in the codelet, it must return a
+                 * pointer to the function to execute on the device, either specified
+                 * directly by the user or by a call to starpu_mpi_ms_get_func().
+                 */
+                kernel = func();
+        }
+        else
+        {
+                /* If the user did not define any starpu_mpi_ms_func_t in
+                 * cl->mpi_ms_func, we try to use cpu_func_name.
+                 */
+                const char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
+                if (func_name)
+                {
+                        starpu_mpi_ms_func_symbol_t symbol = NULL;
+
+                        /* Registration leaves symbol untouched when it fails
+                         * (-ENOMEM); only dereference it on success and let
+                         * the assertion below report the failure. */
+                        if (_starpu_mpi_ms_src_register_kernel(&symbol, func_name) == 0)
+                                kernel = _starpu_mpi_ms_src_get_kernel(symbol);
+                }
+        }
+        STARPU_ASSERT(kernel);
+
+        return (void (*)(void))kernel;
+}
+
+/* Return the number of MPI master-slave devices: the size of MPI_COMM_WORLD
+ * minus one (the master itself), or 0 while the MPI layer is not
+ * initialized.
+ */
+unsigned _starpu_mpi_src_get_device_count()
+{
+        int world_size;
+
+        if (!_starpu_mpi_common_is_mp_initialized())
+                return 0;
+
+        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+        /* One rank is the master, all the others are devices. */
+        return world_size - 1;
+}
+
+/* Entry point of the source-side thread(s) driving the MPI master-slave
+ * devices. Always returns NULL.
+ *
+ * The meaning of ARG depends on STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD:
+ *  - not defined: ARG is an array of worker sets, one per device (as counted
+ *    by _starpu_mpi_src_get_device_count()); this single thread initializes
+ *    all of them and then runs _starpu_src_common_workers_set();
+ *  - defined: ARG is a single worker set; the thread initializes it and then
+ *    runs _starpu_src_common_worker() for that device only.
+ *
+ * The #ifndef/#else blocks below open and close the per-device loops of the
+ * single-thread variant around code shared by both variants, which is why
+ * the shared code is indented one level deeper than the function body.
+ */
+void *_starpu_mpi_src_worker(void *arg)
+{
+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
+        struct _starpu_worker_set *worker_set_mpi = (struct _starpu_worker_set *) arg;
+        int nbsinknodes = _starpu_mpi_src_get_device_count();
+
+        int workersetnum;
+        for (workersetnum = 0; workersetnum < nbsinknodes; workersetnum++)
+        {
+                struct _starpu_worker_set * worker_set = &worker_set_mpi[workersetnum];
+#else
+                struct _starpu_worker_set *worker_set = arg;
+#endif
+
+                /* As all workers of a set share common data, we just use the
+                 * first one for initializing the following items. */
+                struct _starpu_worker *baseworker = &worker_set->workers[0];
+                struct _starpu_machine_config *config = baseworker->config;
+                unsigned baseworkerid = baseworker - config->workers;
+                unsigned devid = baseworker->devid;
+                unsigned i;
+
+                /* unsigned memnode = baseworker->memory_node; */
+
+                _starpu_driver_start(baseworker, _STARPU_FUT_MPI_KEY, 0);
+
+#ifdef STARPU_USE_FXT             
+                /* Only done for the remaining workers of the set when FxT
+                 * tracing is compiled in. */
+                for (i = 1; i < worker_set->nworkers; i++)
+                        _starpu_worker_start(&worker_set->workers[i], _STARPU_FUT_MPI_KEY, 0);
+#endif          
+
+                // A current task makes no sense for a thread managing a worker set.
+                _starpu_set_current_task(NULL);
+
+                /* Give every core of this device a long and a short name. */
+                for (i = 0; i < config->topology.nmpicores[devid]; i++)
+                {
+                        struct _starpu_worker *worker = &config->workers[baseworkerid+i];
+                        snprintf(worker->name, sizeof(worker->name), "MPI_MS %d core %u", devid, i);
+                        snprintf(worker->short_name, sizeof(worker->short_name), "MPI_MS %d.%u", devid, i);
+                }
+
+                /* Name the thread: a single thread serves every device in the
+                 * first variant, one thread per device in the second. */
+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
+                {
+                        char thread_name[16];
+                        snprintf(thread_name, sizeof(thread_name), "MPI_MS");
+                        starpu_pthread_setname(thread_name);
+                }
+#else
+                {
+                        char thread_name[16];
+                        snprintf(thread_name, sizeof(thread_name), "MPI_MS %d", devid);
+                        starpu_pthread_setname(thread_name);
+                }
+#endif
+
+                /* Emit the end-of-initialization trace event of each worker. */
+                for (i = 0; i < worker_set->nworkers; i++)
+                {
+                        struct _starpu_worker *worker = &worker_set->workers[i];
+                        _STARPU_TRACE_WORKER_INIT_END(worker->workerid);
+                }
+
+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
+                _starpu_src_common_init_switch_env(workersetnum);
+        }  /* for */
+
+        /* set the worker zero for the main thread */
+        for (workersetnum = 0; workersetnum < nbsinknodes; workersetnum++)
+        {
+                struct _starpu_worker_set * worker_set = &worker_set_mpi[workersetnum];
+                struct _starpu_worker *baseworker = &worker_set->workers[0];
+#endif
+
+                /* tell the main thread that this one is ready */
+                STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
+                baseworker->status = STATUS_UNKNOWN;
+                worker_set->set_is_initialized = 1;
+                STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
+                STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
+
+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
+        }
+#endif
+
+
+        /* Hand over to the common source-side loop: all worker sets at once in
+         * the single-thread variant, this device's set only otherwise. */
+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
+        _starpu_src_common_workers_set(worker_set_mpi, nbsinknodes, mpi_ms_nodes);
+#else
+        _starpu_src_common_worker(worker_set, baseworkerid, mpi_ms_nodes[devid]);
+#endif
+
+        return NULL;
+
+
+}

+ 52 - 0
src/drivers/mpi/driver_mpi_source.h

@@ -0,0 +1,52 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
+ * Copyright (C) 2016  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_MPI_SOURCE_H__
+#define __DRIVER_MPI_SOURCE_H__
+
+#include <drivers/mp_common/mp_common.h>
+#include <starpu_mpi_ms.h>
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+
+/* Array of structures containing all the information needed to exchange
+ * messages with the devices, indexed by device id. */
+extern struct _starpu_mp_node *mpi_ms_nodes[STARPU_MAXMPIDEVS];
+/* Return the entry of mpi_ms_nodes[] for the device owning MEMORY_NODE. */
+struct _starpu_mp_node *_starpu_mpi_src_get_mp_node_from_memory_node(int memory_node);
+
+/* Number of devices: size of MPI_COMM_WORLD minus the master, or 0 while the
+ * MPI layer is not initialized. */
+unsigned _starpu_mpi_src_get_device_count();
+/* Thread entry point of the source-side driver; always returns NULL. */
+void *_starpu_mpi_src_worker(void *arg);
+
+/* Source-side set-up and tear-down of NODE. */
+void _starpu_mpi_source_init(struct _starpu_mp_node *node);
+void _starpu_mpi_source_deinit(struct _starpu_mp_node *node);
+
+/* Allocate / free a buffer on the device owning MEMORY_NODE. */
+int _starpu_mpi_src_allocate_memory(void ** addr, size_t size, unsigned memory_node);
+void _starpu_mpi_source_free_memory(void *addr, unsigned memory_node);
+
+/* Synchronous copies of SIZE bytes from SRC to DST: device to host, host to
+ * device, and device to device. */
+int _starpu_mpi_copy_mpi_to_ram_sync(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size);
+int _starpu_mpi_copy_ram_to_mpi_sync(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size);
+int _starpu_mpi_copy_sink_to_sink_sync(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size);
+
+/* Asynchronous counterparts; EVENT is forwarded to the common source layer. */
+int _starpu_mpi_copy_mpi_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, void * event);
+int _starpu_mpi_copy_ram_to_mpi_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size, void * event);
+int _starpu_mpi_copy_sink_to_sink_async(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size, void * event);
+
+/* Return the device-side function to execute for job J; NODE is unused. */
+void(* _starpu_mpi_ms_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void);
+
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
+
+#endif	/* __DRIVER_MPI_SOURCE_H__ */

+ 2 - 4
src/drivers/opencl/driver_opencl.c

@@ -756,15 +756,13 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 	if (!idle)
 	{
 		/* Not ready yet, no better thing to do than waiting */
-		__starpu_datawizard_progress(memnode, 1, 0);
-		__starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 0);
+		__starpu_datawizard_progress(1, 0);
 		return 0;
 	}
 #endif
 
 	res = !idle;
-	res |= __starpu_datawizard_progress(memnode, 1, 1);
-	res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
+	res |= __starpu_datawizard_progress(1, 1);
 
 	task = _starpu_get_worker_task(worker, workerid, memnode);
 

+ 6 - 2
src/drivers/scc/driver_scc_common.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -141,6 +141,8 @@ void _starpu_scc_common_send(const struct _starpu_mp_node *node, void *msg, int
 {
 	int ret;
 
+        STARPU_ASSERT_MSG(!event, "Asynchronous msg is not used here");
+
 	/* There are potentially 48 threads running on the master core and RCCE_send write
 	 * data in the MPB associated to this core. It's not thread safe, so we have to protect it.
 	 * RCCE_acquire_lock uses a test&set register on SCC. */
@@ -155,8 +157,10 @@ void _starpu_scc_common_send(const struct _starpu_mp_node *node, void *msg, int
 	RCCE_release_lock(RCCE_ue());
 }
 
-void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int len)
+void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event)
 {
+        STARPU_ASSERT_MSG(!event, "Asynchronous msg is not used here");
+
 	int ret;
 	if ((ret = RCCE_recv(msg, len, node->mp_connection.scc_nodeid)) != RCCE_SUCCESS)
 		STARPU_MP_COMMON_REPORT_ERROR(node, ret);

+ 3 - 3
src/drivers/scc/driver_scc_common.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -39,8 +39,8 @@ int _starpu_scc_common_is_mp_initialized();
 int _starpu_scc_common_get_src_node_id();
 int _starpu_scc_common_is_src_node();
 
-void _starpu_scc_common_send(const struct _starpu_mp_node *node, void *msg, int len);
-void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int len);
+void _starpu_scc_common_send(const struct _starpu_mp_node *node, void *msg, int len, void * event);
+void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event);
 
 void _starpu_scc_common_report_rcce_error(const char *func, const char *file, const int line, const int err_no);
 

+ 7 - 1
src/drivers/scc/driver_scc_sink.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -58,6 +58,9 @@ void _starpu_scc_sink_deinit(struct _starpu_mp_node *node)
 void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len)
 {
 	int ret;
+
+        STARPU_ASSERT_MSG(!event, "Asynchronous msg is not used here");
+
 	if ((ret = RCCE_send(msg, len, STARPU_TO_SCC_SINK_ID(dst_devid))) != RCCE_SUCCESS)
 		STARPU_MP_COMMON_REPORT_ERROR(node, ret);
 }
@@ -65,6 +68,9 @@ void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst
 void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len)
 {
 	int ret;
+
+        STARPU_ASSERT_MSG(!event, "Asynchronous msg is not used here");
+
 	if ((ret = RCCE_recv(msg, len, STARPU_TO_SCC_SINK_ID(src_devid))) != RCCE_SUCCESS)
 		STARPU_MP_COMMON_REPORT_ERROR(node, ret);
 }

+ 3 - 3
src/drivers/scc/driver_scc_sink.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,8 +28,8 @@ void _starpu_scc_sink_init(struct _starpu_mp_node *node);
 void _starpu_scc_sink_launch_workers(struct _starpu_mp_node *node);
 void _starpu_scc_sink_deinit(struct _starpu_mp_node *node);
 
-void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len);
-void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len);
+void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len, void * event);
+void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len, void * event);
 
 void _starpu_scc_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, cpu_set_t * cpuset, int coreid, starpu_pthread_t *thread);
 

+ 4 - 4
src/drivers/scc/driver_scc_source.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  INRIA
+ * Copyright (C) 2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -259,7 +259,7 @@ void _starpu_scc_set_offset_in_shared_memory(void *ptr, void **dev_handle, size_
  */
 int _starpu_scc_copy_src_to_sink(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size)
 {
-	return _starpu_src_common_copy_host_to_sink(_starpu_scc_src_memory_node_to_mp_node(dst_node),
+	return _starpu_src_common_copy_host_to_sink_sync(_starpu_scc_src_memory_node_to_mp_node(dst_node),
 			src, dst, size);
 }
 
@@ -268,13 +268,13 @@ int _starpu_scc_copy_src_to_sink(void *src, unsigned src_node STARPU_ATTRIBUTE_U
  */
 int _starpu_scc_copy_sink_to_src(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size)
 {
-	return _starpu_src_common_copy_sink_to_host(_starpu_scc_src_memory_node_to_mp_node(src_node),
+	return _starpu_src_common_copy_sink_to_host_sync(_starpu_scc_src_memory_node_to_mp_node(src_node),
 			src, dst, size);
 }
 
 int _starpu_scc_copy_sink_to_sink(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size)
 {
-	return _starpu_src_common_copy_sink_to_sink(_starpu_scc_src_memory_node_to_mp_node(src_node),
+	return _starpu_src_common_copy_sink_to_sink_sync(_starpu_scc_src_memory_node_to_mp_node(src_node),
 			_starpu_scc_src_memory_node_to_mp_node(dst_node),
 			src, dst, size);
 }

+ 1 - 0
src/starpu_parameters.h

@@ -27,4 +27,5 @@
 #define _STARPU_OPENCL_ALPHA	12.22f
 #define _STARPU_MIC_ALPHA	0.5f
 #define _STARPU_SCC_ALPHA	1.0f
+#define _STARPU_MPI_MS_ALPHA	1.0f
 #endif /* _STARPU_PARAMETERS_H */

+ 4 - 0
src/top/starpu_top.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony Roy
  * Copyright (C) 2011, 2012, 2013, 2016 CNRS
+ * Copyright (C) 2016  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -111,6 +112,9 @@ static void starpu_top_get_device_type(int id, char* type)
 	case STARPU_SCC_WORKER:
 		strncpy(type, "SCC", 9);
 		break;
+	case STARPU_MPI_WORKER:
+		strncpy(type, "MPI", 9);
+		break;
 	}
 	type[9] = 0;
 }

+ 7 - 3
tests/Makefile.am

@@ -110,11 +110,15 @@ LOADER			=
 LOADER_BIN		=	$(top_builddir)/tests/loader-cross.sh
 endif
 
+if STARPU_USE_MPI_MASTER_SLAVE
+MPI = $(MPIEXEC) $(MPIEXEC_ARGS) -np 4
+endif
+
 if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
-LOG_COMPILER		=	$(LOADER_BIN)
+TESTS_ENVIRONMENT   =   top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+LOG_COMPILER        =   $(MPI) $(LOADER_BIN)
 else
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
+TESTS_ENVIRONMENT   =   top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 endif
 

+ 2 - 1
tests/datawizard/copy.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2011, 2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -64,7 +65,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER) == 0 &&
-		starpu_worker_get_count_by_type(STARPU_MIC_WORKER) == 0)
+		starpu_worker_get_count_by_type(STARPU_MIC_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_MPI_WORKER) == 0)
 	{
 		FPRINTF(stderr, "This application requires a CUDA , OpenCL or MIC Worker\n");
 		starpu_shutdown();

+ 1 - 0
tests/datawizard/manual_reduction.c

@@ -175,6 +175,7 @@ int main(int argc, char **argv)
 
 	starpu_conf_init(&conf);
 	conf.nmic = 0;
+	conf.nmpi_ms = 0;
 	conf.nscc = 0;
 
 	variable = INIT_VALUE;

+ 19 - 14
tests/errorcheck/starpu_init_noworker.c

@@ -57,22 +57,27 @@ int main(int argc, char **argv)
 	conf.nopencl = 0;
 	conf.nmic = 0;
 	conf.nscc = 0;
+        conf.nmpi_ms = 0;
 
 	/* starpu_init should return -ENODEV */
-	ret = starpu_initialize(&conf, &argc, &argv);
-	if (ret == -ENODEV)
-	     return EXIT_SUCCESS;
-	else
-	{
-	     	unsigned ncpu = starpu_cpu_worker_get_count();
-		unsigned ncuda = starpu_cuda_worker_get_count();
-		unsigned nopencl = starpu_opencl_worker_get_count();
-		FPRINTF(stderr, "StarPU has found :\n");
-		FPRINTF(stderr, "\t%u CPU cores\n", ncpu);
-		FPRINTF(stderr, "\t%u CUDA devices\n", ncuda);
-		FPRINTF(stderr, "\t%u OpenCL devices\n", nopencl);
-		return EXIT_FAILURE;
-	}
+        ret = starpu_initialize(&conf, &argc, &argv);
+        if (ret == -ENODEV)
+                return EXIT_SUCCESS;
+        else
+        {
+                unsigned ncpu = starpu_cpu_worker_get_count();
+                unsigned ncuda = starpu_cuda_worker_get_count();
+                unsigned nopencl = starpu_opencl_worker_get_count();
+                unsigned nmic = starpu_mic_worker_get_count();
+                unsigned nmpi_ms = starpu_mpi_ms_worker_get_count();
+                FPRINTF(stderr, "StarPU has found :\n");
+                FPRINTF(stderr, "\t%u CPU cores\n", ncpu);
+                FPRINTF(stderr, "\t%u CUDA devices\n", ncuda);
+                FPRINTF(stderr, "\t%u OpenCL devices\n", nopencl);
+                FPRINTF(stderr, "\t%u MIC devices\n", nmic);
+                FPRINTF(stderr, "\t%u MPI Master-Slaves devices\n", nmpi_ms);
+                return EXIT_FAILURE;
+        }
 
 
 }

+ 1 - 0
tests/perfmodels/valid_model.c

@@ -105,6 +105,7 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 
 	// We need to call starpu_init again to initialise values used by perfmodels
 	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	char path[256];