8 years ago · 64bc8c56a9
--- a/Makefile.am
+++ b/Makefile.am
@@ -82,6 +82,7 @@ versinclude_HEADERS = 				\
 
				 	include/starpu_openmp.h			\
			
 
				 	include/starpu_sink.h			\
			
 
				 	include/starpu_mic.h			\
			
 
				+	include/starpu_mpi_ms.h			\
			
 
				 	include/starpu_scc.h			\
			
 
				 	include/starpu_expert.h			\
			
 
				 	include/starpu_profiling.h		\
			
--- a/configure.ac
+++ b/configure.ac
@@ -78,6 +78,8 @@ AC_PROG_SED
 
				 AC_PROG_LN_S
			
 
				 AC_PROG_F77
			
 
				 AC_PROG_FC
			
 
				+AC_PROG_GREP
			
 
				+AC_PROG_EGREP
			
 
				 AC_CHECK_PROGS(PROG_STAT,gstat stat)
			
 
				 AC_CHECK_PROGS(PROG_DATE,gdate date)
			
 
				 AC_OPENMP
			
@@ -94,6 +96,306 @@ if test x$enable_perf_debug = xyes; then
 
				     enable_shared=no
			
 
				 fi
			
 
				 
			
 
				+###############################################################################
			
 
				+#                                                                             #
			
 
				+#                                 Drivers                                     #
			
 
				+#                                                                             #
			
 
				+###############################################################################
			
 
				+
			
 
				+AC_ARG_ENABLE(opencl-simulator, [AS_HELP_STRING([--enable-opencl-simulator],
			
 
				+				[Enable the use of an OpenCL simulator])],
			
 
				+				enable_opencl_simulator=$enableval, enable_opencl_simulator=no)
			
 
				+if test x$enable_opencl_simulator = xyes; then
			
 
				+	enable_simgrid=yes
			
 
				+	AC_DEFINE(STARPU_OPENCL_SIMULATOR, [1], [Define this to enable using an OpenCL simulator])
			
 
				+fi
			
 
				+
			
 
				+AC_ARG_WITH(simgrid-dir,
			
 
				+	[AS_HELP_STRING([--with-simgrid-dir=<path>],
			
 
				+	[specify SimGrid installation directory])],
			
 
				+	[
			
 
				+		simgrid_dir="$withval"
			
 
				+		# in case this was not explicit yet
			
 
				+		enable_simgrid=yes
			
 
				+	], simgrid_dir=no)
			
 
				+
			
 
				+AC_ARG_WITH(simgrid-include-dir,
			
 
				+	[AS_HELP_STRING([--with-simgrid-include-dir=<path>],
			
 
				+	[specify where SimGrid headers are installed])],
			
 
				+	[
			
 
				+		simgrid_include_dir="$withval"
			
 
				+		# in case this was not explicit yet
			
 
				+		enable_simgrid=yes
			
 
				+	], [simgrid_include_dir=no])
			
 
				+
			
 
				+AC_ARG_WITH(simgrid-lib-dir,
			
 
				+	[AS_HELP_STRING([--with-simgrid-lib-dir=<path>],
			
 
				+	[specify where SimGrid libraries are installed])],
			
 
				+	[
			
 
				+		simgrid_lib_dir="$withval"
			
 
				+		# in case this was not explicit yet
			
 
				+		enable_simgrid=yes
			
 
				+	], [simgrid_lib_dir=no])
			
 
				+
			
 
				+AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
			
 
				+			[Enable simulating execution in simgrid])],
			
 
				+			enable_simgrid=$enableval, enable_simgrid=no)
			
 
				+if test x$enable_simgrid = xyes ; then
			
 
				+   	if test -n "$SIMGRID_CFLAGS" ; then
			
 
				+	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
			
 
				+	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
			
 
				+	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
			
 
				+	fi
			
 
				+	if test -n "$SIMGRID_LIBS" ; then
			
 
				+		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
			
 
				+	fi
			
 
				+	if test "$simgrid_dir" != "no" ; then
			
 
				+	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
			
 
				+	   	CXXFLAGS="-I$simgrid_dir/include $CXXFLAGS"
			
 
				+	   	NVCCFLAGS="-I$simgrid_dir/include $NVCCFLAGS"
			
 
				+	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
			
 
				+	fi
			
 
				+	if test "$simgrid_include_dir" != "no" ; then
			
 
				+	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
			
 
				+	   	CXXFLAGS="-I$simgrid_include_dir $CXXFLAGS"
			
 
				+	   	NVCCFLAGS="-I$simgrid_include_dir $NVCCFLAGS"
			
 
				+	fi
			
 
				+	if test "$simgrid_lib_dir" != "no" ; then
			
 
				+	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
			
 
				+	fi
			
 
				+	AC_HAVE_LIBRARY([simgrid], [],
			
 
				+		[
			
 
				+			AC_MSG_ERROR(Simgrid support needs simgrid installed)
			
 
				+		]
			
 
				+	)
			
 
				+	AC_CHECK_HEADERS([simgrid/msg.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MSG_H], [1], [Define to 1 if you have msg.h in simgrid/.])])
			
 
				+	AC_CHECK_HEADERS([xbt/synchro.h], [AC_DEFINE([STARPU_HAVE_XBT_SYNCHRO_H], [1], [Define to 1 if you have synchro.h in xbt/.])])
			
 
				+	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
			
 
				+   	AC_CHECK_FUNCS([MSG_process_join MSG_process_attach MSG_get_as_by_name MSG_environment_get_routing_root MSG_host_get_speed xbt_mutex_try_acquire smpi_process_set_user_data sg_link_name])
			
 
				+	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
			
 
				+	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
			
 
				+	AC_CHECK_FUNCS([SIMIX_process_get_code], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE], [1], [Define to 1 if you have the `SIMIX_process_get_code' function.])])
			
 
				+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
			
 
				+		    		[[
			
 
				+#ifdef STARPU_HAVE_SIMGRID_MSG_H
			
 
				+#include <simgrid/msg.h>
			
 
				+#else
			
 
				+#include <msg/msg.h>
			
 
				+#endif
			
 
				+				 ]],
			
 
				+				[[msg_host_t foo; ]]
			
 
				+			    )],
			
 
				+	                 [],
			
 
				+	                 [
			
 
				+			   AC_MSG_ERROR(StarPU needs a version of Simgrid which defines the type msg_host_t (should be any version >= 3.8.1))
			
 
				+		         ])
			
 
				+	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
			
 
				+	# We won't bind or detect anything
			
 
				+	with_hwloc=no
			
 
				+
			
 
				+	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
			
 
				+	AC_LANG_PUSH([C++])
			
 
				+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
			
 
				+			  #ifdef HAVE_SIMGRID_MSG_H
			
 
				+			  #include <simgrid/msg.h>
			
 
				+			  #include <simgrid/host.h>
			
 
				+			  #else
			
 
				+			  #include <msg/msg.h>
			
 
				+			  #endif
			
 
				+			  ]])],,
			
 
				+			  CXXFLAGS="-std=c++11 $CXXFLAGS"
			
 
				+			  NVCCFLAGS="-std=c++11 $NVCCFLAGS")
			
 
				+	AC_LANG_POP([C++])
			
 
				+fi
			
 
				+AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
			
 
				+AC_SUBST(SIMGRID_CFLAGS)
			
 
				+AC_SUBST(SIMGRID_LIBS)
			
 
				+AC_MSG_CHECKING(whether SimGrid is enabled)
			
 
				+AC_MSG_RESULT($enable_simgrid)
			
 
				+
			
 
				+AC_MSG_CHECKING(whether blocking drivers should be enabled)
			
 
				+AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--enable-blocking-drivers], [enable blocking drivers])],
			
 
				+				enable_blocking=$enableval, enable_blocking=$enable_simgrid)
			
 
				+AC_MSG_RESULT($enable_blocking)
			
 
				+
			
 
				+if test x$enable_blocking = xno ; then
			
 
				+	AC_DEFINE(STARPU_NON_BLOCKING_DRIVERS, [1], [drivers must progress])
			
 
				+fi
			
 
				+
			
 
				+###############################################################################
			
 
				+#                                                                             #
			
 
				+#                                    MPI                                      #
			
 
				+#                                                                             #
			
 
				+###############################################################################
			
 
				+
			
 
				+AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
			
 
				+                              [Disable StarPU MPI library generation])],
			
 
				+            [enable_mpi=$enableval],
			
 
				+            [enable_mpi=yes])
			
 
				+
			
 
				+AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
			
 
				+                              [Enable StarPU to run with the master-slave mode])],
			
 
				+            use_mpi_master_slave=$enableval,
			
 
				+            use_mpi_master_slave=no)
			
 
				+
			
 
				+#Check MPICC
			
 
				+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
			
 
				+           [Path of the mpicc compiler])],
			
 
				+   [
			
 
				+       if test x$withval = xyes; then
			
 
				+           AC_MSG_ERROR(--with-mpicc must be given a pathname)
			
 
				+       else
			
 
				+           mpicc_path=$withval
			
 
				+       fi
			
 
				+   ],
			
 
				+   [
			
 
				+       if test x$enable_simgrid = xyes ; then
			
 
				+           DEFAULT_MPICC=smpicc
			
 
				+       else
			
 
				+           DEFAULT_MPICC=mpicc
			
 
				+       fi
			
 
				+       # nothing was specified: default value is used
			
 
				+       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
			
 
				+   ])
			
 
				+
			
 
				+# We test if the MPICC compiler exists
			
 
				+if test ! -x "$mpicc_path"; then
			
 
				+    # MPICC does not exist or is not executable (path quoted so an empty or space-containing value is rejected)
			
 
				+    AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
			
 
				+    use_mpi=no
			
 
				+else
			
 
				+    use_mpi=yes
			
 
				+    if test x$enable_simgrid = xyes ; then
			
 
				+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
			
 
				+                    [Path of the smpirun helper])],
			
 
				+            [
			
 
				+                if test x$withval = xyes; then
			
 
				+                    AC_MSG_ERROR(--with-smpirun must be given a pathname)
			
 
				+                else
			
 
				+                    smpirun_path=$withval
			
 
				+                fi
			
 
				+            ],
			
 
				+            [
			
 
				+                # nothing was specified: default value is used
			
 
				+                AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
			
 
				+            ])
			
 
				+
			
 
				+    fi
			
 
				+fi
			
 
				+
			
 
				+AC_MSG_CHECKING(mpicc path)
			
 
				+AC_MSG_RESULT($mpicc_path)
			
 
				+AC_SUBST(MPICC, $mpicc_path)
			
 
				+
			
 
				+
			
 
				+#Check MPICXX/MPIC++
			
 
				+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
			
 
				+           [Path of the mpicxx/mpic++ compiler])],
			
 
				+   [
			
 
				+       if test x$withval = xyes; then
			
 
				+           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
			
 
				+       else
			
 
				+           mpicxx_path=$withval
			
 
				+       fi
			
 
				+   ],
			
 
				+   [
			
 
				+       if test x$enable_simgrid = xyes ; then
			
 
				+           DEFAULT_MPICXX=smpicxx
			
 
				+       else
			
 
				+           DEFAULT_MPICXX=mpicxx
			
 
				+       fi
			
 
				+       # nothing was specified: default value is used
			
 
				+       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
			
 
				+       
			
 
				+       # try with mpic++ if mpicxx was not found
			
 
				+       if test x$mpicxx_path = xno ; then
			
 
				+            DEFAULT_MPICXX=mpic++
			
 
				+            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
			
 
				+       fi
			
 
				+   ])
			
 
				+
			
 
				+# We test if the MPICXX/MPIC++ compiler exists
			
 
				+if test ! -x "$mpicxx_path"; then
			
 
				+    # MPICXX/MPIC++ does not exist or is not executable (path quoted so an empty or space-containing value is rejected)
			
 
				+    AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
			
 
				+    use_mpicxx=no
			
 
				+else
			
 
				+    use_mpicxx=yes
			
 
				+fi
			
 
				+
			
 
				+AC_MSG_CHECKING(mpicxx/mpic++ path)
			
 
				+AC_MSG_RESULT($mpicxx_path)
			
 
				+AC_SUBST(MPICXX, $mpicxx_path)
			
 
				+
			
 
				+
			
 
				+if test x$use_mpi = xyes -a \( x$enable_mpi = xyes -o x$use_mpi_master_slave = xyes \) ; then
			
 
				+    cc_or_mpicc=$mpicc_path
			
 
				+        # For some reason, libtool uses gcc instead of mpicc when linking
			
 
				+        # libstarpumpi.
			
 
				+        # On Darwin (and maybe other systems ?) the linker will fail (undefined
			
 
				+        # references to MPI_*). We manually add the required flags to fix this
			
 
				+        # issue.
			
 
				+        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
			
 
				+else
			
 
				+    cc_or_mpicc=$CC
			
 
				+fi
			
 
				+
			
 
				+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
			
 
				+
			
 
				+AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
			
 
				+				   [Enable StarPU MPI activity polling method])],
			
 
				+				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
			
 
				+if  test x$enable_mpi_progression_hook = xyes; then
			
 
				+	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
			
 
				+fi
			
 
				+
			
 
				+#We can only build MPI Master Slave if User wants it and MPI is available
			
 
				+if test x$use_mpi_master_slave = xyes -a x$use_mpi = xyes -a x$use_mpicxx = xyes; then
			
 
				+    build_mpi_master_slave=yes
			
 
				+else
			
 
				+    build_mpi_master_slave=no
			
 
				+fi
			
 
				+
			
 
				+#Warn users that they cannot use both at the same time
			
 
				+if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
			
 
				+    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time !)
			
 
				+fi
			
 
				+
			
 
				+if test x$build_mpi_master_slave = xyes; then # master-slave mode: compile and link everything through the MPI wrappers
			
 
				+    AC_DEFINE(STARPU_USE_MPI_MASTER_SLAVE, [1], [MPI Master Slave support is enabled])
			
 
				+    CC=$mpicc_path
			
 
				+    CCLD=$mpicc_path
			
 
				+    CXX=$mpicxx_path
			
 
				+    CXXLD=$mpicxx_path
			
 
				+fi
			
 
				+
			
 
				+AC_ARG_WITH(mpi-master-slave-multiple-thread, [AS_HELP_STRING([--with-mpi-master-slave-multiple-thread])],
			
 
				+	[AC_DEFINE([STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD], [1], [Use multiple threads to communicate with slaves])])
			
 
				+
			
 
				+AC_MSG_CHECKING(whether the master-slave mode should be enabled)
			
 
				+AC_MSG_RESULT($build_mpi_master_slave)
			
 
				+AM_CONDITIONAL([STARPU_USE_MPI_MASTER_SLAVE], [test x$build_mpi_master_slave = xyes])
			
 
				+
			
 
				+AC_MSG_CHECKING(maximum number of MPI master-slave devices)
			
 
				+AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
			
 
				+			[maximum number of MPI master-slave devices])],
			
 
				+			nmaxmpidev=$enableval,
			
 
				+            [
			
 
				+             if test x$build_mpi_master_slave = xyes; then
			
 
				+                 nmaxmpidev=4
			
 
				+             else
			
 
				+                 nmaxmpidev=0
			
 
				+             fi
			
 
				+            ])
			
 
				+AC_MSG_RESULT($nmaxmpidev)
			
 
				+AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
			
 
				+
			
 
				+###############################################################################
			
 
				+#                                                                             #
			
 
				+#                                LIBTOOLS                                     #
			
 
				+#                                                                             #
			
 
				+###############################################################################
			
 
				+
			
 
				 LT_PREREQ([2.2])
			
 
				 LT_INIT([win32-dll])
			
 
				 
			
@@ -139,6 +441,85 @@ AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
 
				 
			
 
				 ###############################################################################
			
 
				 #                                                                             #
			
 
				+#                       Miscellaneous things for MPI                          #
			
 
				+#                                                                             #
			
 
				+###############################################################################
			
 
				+
			
 
				+# MPI testcases run by default in a developer checkout; --enable-mpi-check / --disable-mpi-check overrides that default
			
 
				+AC_ARG_ENABLE(mpi-check, [AS_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases])])
			
 
				+running_mpi_check=no
			
 
				+if test $svndir = 1 -o -d "$srcdir/.git" ; then
			
 
				+    running_mpi_check=yes
			
 
				+fi
			
 
				+if test x$enable_mpi_check = xyes ; then
			
 
				+    running_mpi_check=yes
			
 
				+fi
			
 
				+if test x$enable_mpi_check = xno ; then
			
 
				+    running_mpi_check=no
			
 
				+fi
			
 
				+
			
 
				+
			
 
				+# Check if mpiexec is available
			
 
				+AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
			
 
				+            [Path of mpiexec])],
			
 
				+    [
			
 
				+        if test x$withval = xyes; then
			
 
				+            AC_MSG_ERROR(--with-mpiexec must be given a pathname)
			
 
				+        else
			
 
				+            mpiexec_path=$withval
			
 
				+        fi
			
 
				+    ],
			
 
				+    [
			
 
				+        # nothing was specified: look in the path
			
 
				+        AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
			
 
				+    ])
			
 
				+
			
 
				+AC_MSG_CHECKING(whether mpiexec is available)
			
 
				+AC_MSG_RESULT($mpiexec_path)
			
 
				+
			
 
				+# We test if MPIEXEC exists
			
 
				+if test ! -x "$mpiexec_path"; then
			
 
				+    # MPIEXEC does not exist or is not executable: disable the MPI tests and clear the path
			
 
				+    AC_MSG_RESULT(The mpiexec script is not valid)
			
 
				+    running_mpi_check=no
			
 
				+    mpiexec_path=""
			
 
				+fi
			
 
				+
			
 
				+AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
			
 
				+if test x$use_mpi = xyes ; then
			
 
				+    AC_MSG_CHECKING(whether MPI tests should be run)
			
 
				+    AC_MSG_RESULT($running_mpi_check)
			
 
				+    AC_SUBST(MPIEXEC,$mpiexec_path)
			
 
				+fi
			
 
				+
			
 
				+#We can only build StarPU MPI Library if User wants it and MPI is available
			
 
				+if test x$use_mpi = xyes -a x$enable_mpi = xyes ; then
			
 
				+    build_mpi_lib=yes
			
 
				+else
			
 
				+    build_mpi_lib=no
			
 
				+fi
			
 
				+
			
 
				+AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
			
 
				+AC_MSG_RESULT($build_mpi_lib)
			
 
				+
			
 
				+AC_SUBST(USE_MPI, $build_mpi_lib)
			
 
				+AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes)
			
 
				+if test x$build_mpi_lib = xyes; then
			
 
				+	AC_DEFINE(STARPU_USE_MPI,[1],[whether the StarPU MPI library is available])
			
 
				+else
			
 
				+	running_mpi_check=no
			
 
				+fi
			
 
				+
			
 
				+AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
			
 
				+			[Arguments for mpiexec])],
			
 
				+	[
			
 
				+		mpiexec_args=$withval
			
 
				+	])
			
 
				+AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
			
 
				+
			
 
				+
			
 
				+###############################################################################
			
 
				+#                                                                             #
			
 
				 #                           MIC device compilation                            #
			
 
				 #   (Must be done in beginning to change prefix in the whole configuration)   #
			
 
				 #                                                                             #
			
@@ -1021,143 +1402,30 @@ if test x$disable_asynchronous_opencl_copy = xyes ; then
 
				    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
			
 
				 fi
			
 
				 
			
 
				-AC_MSG_CHECKING(whether asynchronous MIC copy should be disabled)
			
 
				-AC_ARG_ENABLE(asynchronous-mic-copy, [AS_HELP_STRING([--disable-asynchronous-mic-copy],
			
 
				-			[disable asynchronous copy between CPU and MIC devices])],
			
 
				-			enable_asynchronous_mic_copy=$enableval, enable_asynchronous_mic_copy=yes)
			
 
				-disable_asynchronous_mic_copy=no
			
 
				-if test x$enable_asynchronous_mic_copy = xno ; then
			
 
				-   disable_asynchronous_mic_copy=yes
			
 
				-fi
			
 
				-AC_MSG_RESULT($disable_asynchronous_mic_copy)
			
 
				-if test x$disable_asynchronous_mic_copy = xyes ; then
			
 
				-   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and MIC devices])
			
 
				-fi
			
 
				-
			
 
				-###############################################################################
			
 
				-#                                                                             #
			
 
				-#                                 Drivers                                     #
			
 
				-#                                                                             #
			
 
				-###############################################################################
			
 
				-
			
 
				-AC_ARG_ENABLE(opencl-simulator, [AS_HELP_STRING([--enable-opencl-simulator],
			
 
				-				[Enable the use of an OpenCL simulator])],
			
 
				-				enable_opencl_simulator=$enableval, enable_opencl_simulator=no)
			
 
				-if test x$enable_opencl_simulator = xyes; then
			
 
				-	enable_simgrid=yes
			
 
				-	AC_DEFINE(STARPU_OPENCL_SIMULATOR, [1], [Define this to enable using an OpenCL simulator])
			
 
				-fi
			
 
				-
			
 
				-AC_ARG_WITH(simgrid-dir,
			
 
				-	[AS_HELP_STRING([--with-simgrid-dir=<path>],
			
 
				-	[specify SimGrid installation directory])],
			
 
				-	[
			
 
				-		simgrid_dir="$withval"
			
 
				-		# in case this was not explicit yet
			
 
				-		enable_simgrid=yes
			
 
				-	], simgrid_dir=no)
			
 
				-
			
 
				-AC_ARG_WITH(simgrid-include-dir,
			
 
				-	[AS_HELP_STRING([--with-simgrid-include-dir=<path>],
			
 
				-	[specify where SimGrid headers are installed])],
			
 
				-	[
			
 
				-		simgrid_include_dir="$withval"
			
 
				-		# in case this was not explicit yet
			
 
				-		enable_simgrid=yes
			
 
				-	], [simgrid_include_dir=no])
			
 
				-
			
 
				-AC_ARG_WITH(simgrid-lib-dir,
			
 
				-	[AS_HELP_STRING([--with-simgrid-lib-dir=<path>],
			
 
				-	[specify where SimGrid libraries are installed])],
			
 
				-	[
			
 
				-		simgrid_lib_dir="$withval"
			
 
				-		# in case this was not explicit yet
			
 
				-		enable_simgrid=yes
			
 
				-	], [simgrid_lib_dir=no])
			
 
				-
			
 
				-AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
			
 
				-			[Enable simulating execution in simgrid])],
			
 
				-			enable_simgrid=$enableval, enable_simgrid=no)
			
 
				-if test x$enable_simgrid = xyes ; then
			
 
				-   	if test -n "$SIMGRID_CFLAGS" ; then
			
 
				-	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
			
 
				-	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
			
 
				-	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
			
 
				-	fi
			
 
				-	if test -n "$SIMGRID_LIBS" ; then
			
 
				-		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
			
 
				-	fi
			
 
				-	if test "$simgrid_dir" != "no" ; then
			
 
				-	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
			
 
				-	   	CXXFLAGS="-I$simgrid_dir/include $CXXFLAGS"
			
 
				-	   	NVCCFLAGS="-I$simgrid_dir/include $NVCCFLAGS"
			
 
				-	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
			
 
				-	fi
			
 
				-	if test "$simgrid_include_dir" != "no" ; then
			
 
				-	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
			
 
				-	   	CXXFLAGS="-I$simgrid_include_dir $CXXFLAGS"
			
 
				-	   	NVCCFLAGS="-I$simgrid_include_dir $NVCCFLAGS"
			
 
				-	fi
			
 
				-	if test "$simgrid_lib_dir" != "no" ; then
			
 
				-	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
			
 
				-	fi
			
 
				-	AC_HAVE_LIBRARY([simgrid], [],
			
 
				-		[
			
 
				-			AC_MSG_ERROR(Simgrid support needs simgrid installed)
			
 
				-		]
			
 
				-	)
			
 
				-	AC_CHECK_HEADERS([simgrid/msg.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MSG_H], [1], [Define to 1 if you have msg.h in simgrid/.])])
			
 
				-	AC_CHECK_HEADERS([xbt/synchro.h], [AC_DEFINE([STARPU_HAVE_XBT_SYNCHRO_H], [1], [Define to 1 if you have synchro.h in xbt/.])])
			
 
				-	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
			
 
				-   	AC_CHECK_FUNCS([MSG_process_join MSG_process_attach MSG_get_as_by_name MSG_environment_get_routing_root MSG_host_get_speed xbt_mutex_try_acquire smpi_process_set_user_data sg_link_name])
			
 
				-	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
			
 
				-	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
			
 
				-	AC_CHECK_FUNCS([SIMIX_process_get_code], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE], [1], [Define to 1 if you have the `SIMIX_process_get_code' function.])])
			
 
				-	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
			
 
				-		    		[[
			
 
				-#ifdef STARPU_HAVE_SIMGRID_MSG_H
			
 
				-#include <simgrid/msg.h>
			
 
				-#else
			
 
				-#include <msg/msg.h>
			
 
				-#endif
			
 
				-				 ]],
			
 
				-				[[msg_host_t foo; ]]
			
 
				-			    )],
			
 
				-	                 [],
			
 
				-	                 [
			
 
				-			   AC_MSG_ERROR(StarPU needs a version of Simgrid which defines the type msg_host_t (should be any version >= 3.8.1))
			
 
				-		         ])
			
 
				-	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
			
 
				-	# We won't bind or detect anything
			
 
				-	with_hwloc=no
			
 
				-
			
 
				-	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
			
 
				-	AC_LANG_PUSH([C++])
			
 
				-	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
			
 
				-			  #ifdef HAVE_SIMGRID_MSG_H
			
 
				-			  #include <simgrid/msg.h>
			
 
				-			  #include <simgrid/host.h>
			
 
				-			  #else
			
 
				-			  #include <msg/msg.h>
			
 
				-			  #endif
			
 
				-			  ]])],,
			
 
				-			  CXXFLAGS="-std=c++11 $CXXFLAGS"
			
 
				-			  NVCCFLAGS="-std=c++11 $NVCCFLAGS")
			
 
				-	AC_LANG_POP([C++])
			
 
				-fi
			
 
				-AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
			
 
				-AC_SUBST(SIMGRID_CFLAGS)
			
 
				-AC_SUBST(SIMGRID_LIBS)
			
 
				-AC_MSG_CHECKING(whether SimGrid is enabled)
			
 
				-AC_MSG_RESULT($enable_simgrid)
			
 
				-
			
 
				-AC_MSG_CHECKING(whether blocking drivers should be enabled)
			
 
				-AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--enable-blocking-drivers], [enable blocking drivers])],
			
 
				-				enable_blocking=$enableval, enable_blocking=$enable_simgrid)
			
 
				-AC_MSG_RESULT($enable_blocking)
			
 
				+AC_MSG_CHECKING(whether asynchronous MIC copy should be disabled)
			
 
				+AC_ARG_ENABLE(asynchronous-mic-copy, [AS_HELP_STRING([--disable-asynchronous-mic-copy],
			
 
				+			[disable asynchronous copy between CPU and MIC devices])],
			
 
				+			enable_asynchronous_mic_copy=$enableval, enable_asynchronous_mic_copy=yes)
			
 
				+disable_asynchronous_mic_copy=no
			
 
				+if test x$enable_asynchronous_mic_copy = xno ; then
			
 
				+   disable_asynchronous_mic_copy=yes
			
 
				+fi
			
 
				+AC_MSG_RESULT($disable_asynchronous_mic_copy)
			
 
				+if test x$disable_asynchronous_mic_copy = xyes ; then
			
 
				+   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and MIC devices])
			
 
				+fi
			
 
				 
			
 
				-if test x$enable_blocking = xno ; then
			
 
				-	AC_DEFINE(STARPU_NON_BLOCKING_DRIVERS, [1], [drivers must progress])
			
 
				+AC_MSG_CHECKING(whether asynchronous MPI Master Slave copy should be disabled)
			
 
				+AC_ARG_ENABLE(asynchronous-mpi-master-slave-copy, [AS_HELP_STRING([--disable-asynchronous-mpi-master-slave-copy],
			
 
				+			[disable asynchronous copy between MPI Master and MPI Slave devices])],
			
 
				+			enable_asynchronous_mpi_master_slave_copy=$enableval, enable_asynchronous_mpi_master_slave_copy=yes)
			
 
				+disable_asynchronous_mpi_master_slave_copy=no
			
 
				+if test x$enable_asynchronous_mpi_master_slave_copy = xno ; then
			
 
				+   disable_asynchronous_mpi_master_slave_copy=yes
			
 
				+fi
			
 
				+AC_MSG_RESULT($disable_asynchronous_mpi_master_slave_copy)
			
 
				+if test x$disable_asynchronous_mpi_master_slave_copy = xyes ; then
			
 
				+   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY], [1], [Define to 1 to disable asynchronous copy between MPI Master and MPI Slave devices])
			
 
				 fi
			
 
				 
			
 
				 ###############################################################################
			
@@ -1733,238 +2001,6 @@ AM_CONDITIONAL([STARPU_USE_AYUDAME2], [test "x$enable_ayudame2" = "xyes"])
 
				 
			
 
				 ###############################################################################
			
 
				 #                                                                             #
			
 
				-#                                    MPI                                      #
			
 
				-#                                                                             #
			
 
				-###############################################################################
			
 
				-
			
 
				-AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
			
 
				-                              [Disable StarPU MPI library generation])],
			
 
				-            [enable_mpi=$enableval],
			
 
				-            [enable_mpi=yes])
			
 
				-
			
 
				-AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
			
 
				-                              [Enable StarPU to run with the master-slave mode])],
			
 
				-            use_mpi_master_slave=$enableval,
			
 
				-            use_mpi_master_slave=no)
			
 
				-
			
 
				-#Check MPICC
			
 
				-AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
			
 
				-           [Path of the mpicc compiler])],
			
 
				-   [
			
 
				-       if test x$withval = xyes; then
			
 
				-           AC_MSG_ERROR(--with-mpicc must be given a pathname)
			
 
				-       else
			
 
				-           mpicc_path=$withval
			
 
				-       fi
			
 
				-   ],
			
 
				-   [
			
 
				-       if test x$enable_simgrid = xyes ; then
			
 
				-           DEFAULT_MPICC=smpicc
			
 
				-       else
			
 
				-           DEFAULT_MPICC=mpicc
			
 
				-       fi
			
 
				-       # nothing was specified: default value is used
			
 
				-       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
			
 
				-   ])
			
 
				-
			
 
				-# We test if the MPICC compiler exists
			
 
				-if test ! -x $mpicc_path; then
			
 
				-    #MPICC does not exists or is not executable
			
 
				-    AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
			
 
				-    use_mpi=no
			
 
				-else
			
 
				-    use_mpi=yes
			
 
				-    if test x$enable_simgrid = xyes ; then
			
 
				-        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
			
 
				-                    [Path of the smpirun helper])],
			
 
				-            [
			
 
				-                if test x$withval = xyes; then
			
 
				-                    AC_MSG_ERROR(--with-smpirun must be given a pathname)
			
 
				-                else
			
 
				-                    smpirun_path=$withval
			
 
				-                fi
			
 
				-            ],
			
 
				-            [
			
 
				-                # nothing was specified: default value is used
			
 
				-                AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
			
 
				-            ])
			
 
				-
			
 
				-    fi
			
 
				-fi
			
 
				-
			
 
				-AC_MSG_CHECKING(mpicc path)
			
 
				-AC_MSG_RESULT($mpicc_path)
			
 
				-AC_SUBST(MPICC, $mpicc_path)
			
 
				-
			
 
				-
			
 
				-#Check MPICXX/MPIC++
			
 
				-AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
			
 
				-           [Path of the mpicxx/mpic++ compiler])],
			
 
				-   [
			
 
				-       if test x$withval = xyes; then
			
 
				-           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
			
 
				-       else
			
 
				-           mpicxx_path=$withval
			
 
				-       fi
			
 
				-   ],
			
 
				-   [
			
 
				-       if test x$enable_simgrid = xyes ; then
			
 
				-           DEFAULT_MPICXX=smpicxx
			
 
				-       else
			
 
				-           DEFAULT_MPICXX=mpicxx
			
 
				-       fi
			
 
				-       # nothing was specified: default value is used
			
 
				-       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
			
 
				-       
			
 
				-       # try with mpic++ if mpicxx was not found
			
 
				-       if test x$mpicxx_path = xno ; then
			
 
				-            DEFAULT_MPICXX=mpic++
			
 
				-            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
			
 
				-       fi
			
 
				-   ])
			
 
				-
			
 
				-# We test if the MPICXX/MPIC++ compiler exists
			
 
				-if test ! -x $mpicxx_path; then
			
 
				-    #MPICXX/MPIC++ does not exists or is not executable
			
 
				-    AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
			
 
				-    use_mpicxx=no
			
 
				-else
			
 
				-    use_mpicxx=yes
			
 
				-fi
			
 
				-
			
 
				-AC_MSG_CHECKING(mpicxx/mpic++ path)
			
 
				-AC_MSG_RESULT($mpicxx_path)
			
 
				-AC_SUBST(MPICXX, $mpicxx_path)
			
 
				-
			
 
				-
			
 
				-if test x$use_mpi = xyes -a x$enable_mpi = xyes; then
			
 
				-    cc_or_mpicc=$mpicc_path
			
 
				-        # For some reason, libtool uses gcc instead of mpicc when linking
			
 
				-        # libstarpumpi.
			
 
				-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
			
 
				-        # references to MPI_*). We manually add the required flags to fix this
			
 
				-        # issue.
			
 
				-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
			
 
				-else
			
 
				-    cc_or_mpicc=$CC
			
 
				-fi
			
 
				-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
			
 
				-
			
 
				-# If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
			
 
				-AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
			
 
				-running_mpi_check=no
			
 
				-if test $svndir = 1 -o -d "$srcdir/.git" ; then
			
 
				-    running_mpi_check=yes
			
 
				-fi
			
 
				-if test x$enable_mpi_check = xyes ; then
			
 
				-    running_mpi_check=yes
			
 
				-fi
			
 
				-if test x$enable_mpi_check = xno ; then
			
 
				-    running_mpi_check=no
			
 
				-fi
			
 
				-
			
 
				-
			
 
				-# Check if mpiexec is available
			
 
				-AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
			
 
				-            [Path of mpiexec])],
			
 
				-    [
			
 
				-        if test x$withval = xyes; then
			
 
				-            AC_MSG_ERROR(--with-mpiexec must be given a pathname)
			
 
				-        else
			
 
				-            mpiexec_path=$withval
			
 
				-        fi
			
 
				-    ],
			
 
				-    [
			
 
				-        # nothing was specified: look in the path
			
 
				-        AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
			
 
				-    ])
			
 
				-
			
 
				-AC_MSG_CHECKING(whether mpiexec is available)
			
 
				-AC_MSG_RESULT($mpiexec_path)
			
 
				-
			
 
				-# We test if MPIEXEC exists
			
 
				-if test ! -x $mpiexec_path; then
			
 
				-    #MPIEXEC does not exists or is not executable
			
 
				-    AC_MSG_RESULT(The mpiexec script is not valid)
			
 
				-        running_mpi_check=no
			
 
				-        mpiexec_path=""
			
 
				-fi
			
 
				-
			
 
				-AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
			
 
				-if test x$use_mpi = xyes ; then
			
 
				-    AC_MSG_CHECKING(whether MPI tests should be run)
			
 
				-    AC_MSG_RESULT($running_mpi_check)
			
 
				-    AC_SUBST(MPIEXEC,$mpiexec_path)
			
 
				-fi
			
 
				-
			
 
				-#We can only build StarPU MPI Library if User wants it and MPI is available
			
 
				-if test x$use_mpi = xyes -a x$enable_mpi = xyes ; then
			
 
				-    build_mpi_lib=yes
			
 
				-else
			
 
				-    build_mpi_lib=no
			
 
				-fi
			
 
				-
			
 
				-AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
			
 
				-AC_MSG_RESULT($build_mpi_lib)
			
 
				-
			
 
				-AC_SUBST(USE_MPI, $build_mpi_lib)
			
 
				-AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes)
			
 
				-if test x$build_mpi_lib = xyes; then
			
 
				-	AC_DEFINE(STARPU_USE_MPI,[1],[whether the StarPU MPI library is available])
			
 
				-else
			
 
				-	running_mpi_check=no
			
 
				-fi
			
 
				-
			
 
				-AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
			
 
				-			[Arguments for mpiexec])],
			
 
				-	[
			
 
				-		mpiexec_args=$withval
			
 
				-	])
			
 
				-AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
			
 
				-
			
 
				-AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
			
 
				-				   [Enable StarPU MPI activity polling method])],
			
 
				-				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
			
 
				-if  test x$enable_mpi_progression_hook = xyes; then
			
 
				-	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
			
 
				-fi
			
 
				-
			
 
				-#We can only build MPI Master Slave if User wants it and MPI is available
			
 
				-if test x$use_mpi_master_slave = xyes -a x$use_mpi = xyes -a x$use_mpicxx = xyes; then
			
 
				-    build_mpi_master_slave=yes
			
 
				-else
			
 
				-    build_mpi_master_slave=no
			
 
				-fi
			
 
				-
			
 
				-if test x$build_mpi_master_slave = xyes; then
			
 
				-    AC_DEFINE(STARPU_USE_MPI_MASTER_SLAVE, [1], [MPI Master Slave support is enabled])
			
 
				-    CC=$mpicc_path    
			
 
				-    CCLD=$mpicc_path      
			
 
				-    CXX=$mpicxx_path      
			
 
				-    CXXLD=mpicxx_path    
			
 
				-fi
			
 
				-
			
 
				-AC_MSG_CHECKING(whether the master-slave mode should be enabled)
			
 
				-AC_MSG_RESULT($build_mpi_master_slave)
			
 
				-AM_CONDITIONAL([STARPU_USE_MPI_MASTER_SLAVE], [test x$build_mpi_master_slave = xyes])
			
 
				-
			
 
				-AC_MSG_CHECKING(maximum number of MPI master-slave devices)
			
 
				-AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
			
 
				-			[maximum number of MPI master-slave devices])],
			
 
				-			nmaxmpidev=$enableval,
			
 
				-            [
			
 
				-             if test x$build_mpi_master_slave = xyes; then
			
 
				-                 nmaxmpidev=4
			
 
				-             else
			
 
				-                 nmaxmpidev=0
			
 
				-             fi
			
 
				-            ])
			
 
				-AC_MSG_RESULT($nmaxmpidev)
			
 
				-AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
			
 
				-
			
 
				-
			
 
				-###############################################################################
			
 
				-#                                                                             #
			
 
				 #                  Miscellaneous options for StarPU                           #
			
 
				 #                                                                             #
			
 
				 ###############################################################################
			
@@ -2087,9 +2123,13 @@ if test x$enable_simgrid != xyes; then
 
				 	if test x$enable_rcce != xyes; then
			
 
				 		nmaxsccdev=0
			
 
				 	fi
			
 
				+    # By default, nmaxmpidev is zero when MPI master-slave support cannot be built.
			
 
				+    # Since nmaxmpidev multiplies maxcpus in the nmaxworkers computation, force it to one.
			
 
				+    if test x$build_mpi_master_slave != xyes; then
			
 
				+        nmaxmpidev=1
			
 
				+    fi
			
 
				 fi
			
 
				-#We suppose Master adds nmaxmpidev workers but slaves don't.
			
 
				-nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmicthreads + $nmaxmpidev + $nmaxsccdev + 15 \) / 16 \) `
			
 
				+nmaxworkers=`expr 16 \* \( \( \( $nmaxmpidev \* $maxcpus \) + $nmaxcudadev + $nmaxopencldev + $nmaxmicthreads + $nmaxsccdev + 15 \) / 16 \) `
			
 
				 AC_MSG_CHECKING(Maximum number of workers)
			
 
				 AC_MSG_RESULT($nmaxworkers)
			
 
				 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -178,11 +178,15 @@ LOADER			=
 
				 LOADER_BIN		=	$(top_builddir)/examples/loader-cross.sh
			
 
				 endif
			
 
				 
			
 
				+if STARPU_USE_MPI_MASTER_SLAVE
			
 
				+MPI = $(MPIEXEC) $(MPIEXEC_ARGS) -np 4
			
 
				+endif
			
 
				+
			
 
				 if STARPU_HAVE_AM111
			
 
				 TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
 
				-LOG_COMPILER		=	$(LOADER_BIN)
			
 
				+LOG_COMPILER		=	$(MPI) $(LOADER_BIN)
			
 
				 else
			
 
				-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
			
 
				+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
			
 
				 endif
			
 
				 
			
 
				 endif
			
--- a/examples/binary/binary.c
+++ b/examples/binary/binary.c
@@ -127,6 +127,7 @@ int main(int argc, char **argv)
 
				 	conf.ncuda = 0;
			
 
				 	conf.nmic = 0;
			
 
				 	conf.nscc = 0;
			
 
				+	conf.nmpi_ms = 0;
			
 
				 
			
 
				         ret = starpu_init(&conf);
			
 
				 	if (STARPU_UNLIKELY(ret == -ENODEV))
			
--- a/examples/stencil/Makefile.am
+++ b/examples/stencil/Makefile.am
@@ -79,11 +79,15 @@ LOADER			=
 
				 LOADER_BIN		=	$(top_builddir)/examples/stencil/loader-cross.sh
			
 
				 endif
			
 
				 
			
 
				+if STARPU_USE_MPI_MASTER_SLAVE
			
 
				+MPI = $(MPIEXEC) $(MPIEXEC_ARGS) -np 4
			
 
				+endif
			
 
				+
			
 
				 if STARPU_HAVE_AM111
			
 
				 TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
 
				-LOG_COMPILER		=	$(LOADER_BIN)
			
 
				+LOG_COMPILER		=	$(MPI) $(LOADER_BIN)
			
 
				 else
			
 
				-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
			
 
				+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
			
 
				 endif
			
 
				 
			
 
				 endif
			
--- a/examples/stencil/implicit-stencil-blocks.c
+++ b/examples/stencil/implicit-stencil-blocks.c
@@ -333,7 +333,7 @@ void allocate_memory_on_node(int rank)
 
				 			starpu_block_data_register(&block->boundaries_handle[B][1], -1, (uintptr_t) NULL, (sizex + 2*K), (sizex + 2*K)*(sizey + 2*K), (sizex + 2*K), (sizey + 2*K), K, sizeof(TYPE));
			
 
				 		}
			
 
				 
			
 
				-#ifdef STARPU_USE_MPI
			
 
				+#if defined(STARPU_USE_MPI)  && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 		/* Register all data to StarPU-MPI, even the ones that are not
			
 
				 		 * allocated on the local node. */
			
 
				 
			
--- a/examples/stencil/implicit-stencil-kernels.c
+++ b/examples/stencil/implicit-stencil-kernels.c
@@ -192,7 +192,7 @@ static void update_func_cuda(void *descr[], void *arg)
 
				 		FPRINTF(stderr,"!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
			
 
				 	else
			
 
				 		DEBUG( "!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int rank = 0;
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	DEBUG( "!!!           RANK %d              !!!\n", rank);
			
@@ -282,7 +282,7 @@ static void update_func_opencl(void *descr[], void *arg)
 
				 		FPRINTF(stderr,"!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
			
 
				 	else
			
 
				 		DEBUG( "!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int rank = 0;
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	DEBUG( "!!!           RANK %d              !!!\n", rank);
			
@@ -355,7 +355,7 @@ void update_func_cpu(void *descr[], void *arg)
 
				 		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
			
 
				 	else
			
 
				 		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int rank = 0;
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	DEBUG( "!!!           RANK %d            !!!\n", rank);
			
--- a/examples/stencil/implicit-stencil-tasks.c
+++ b/examples/stencil/implicit-stencil-tasks.c
@@ -35,7 +35,7 @@
 
				 # define DEBUG(fmt, ...)
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_USE_MPI
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 #include <starpu_mpi.h>
			
 
				 #define starpu_insert_task(...) starpu_mpi_insert_task(MPI_COMM_WORLD, __VA_ARGS__)
			
 
				 #endif
			
--- a/examples/stencil/implicit-stencil.c
+++ b/examples/stencil/implicit-stencil.c
@@ -193,7 +193,7 @@ void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
 
				 
			
 
				 unsigned global_workerid(unsigned local_workerid)
			
 
				 {
			
 
				-#ifdef STARPU_USE_MPI
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int rank;
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	unsigned workers_per_node = starpu_worker_get_count();
			
@@ -210,7 +210,7 @@ int main(int argc, char **argv)
 
				 	int world_size;
			
 
				 	int ret;
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int thread_support;
			
 
				 	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support))
			
 
				 	{
			
@@ -237,7 +237,7 @@ int main(int argc, char **argv)
 
				 	if (ret == -ENODEV) return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 #endif
			
@@ -249,7 +249,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	init_problem(argc, argv, rank, world_size);
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
 
				 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
 
				 #endif
			
@@ -270,7 +270,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	end = starpu_timing_now();
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
 
				 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
 
				 #endif
			
@@ -288,7 +288,7 @@ int main(int argc, char **argv)
 
				 	double max_timing = timing;
			
 
				 	double sum_timing = timing;
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int reduce_ret;
			
 
				 
			
 
				 	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
			
@@ -368,13 +368,13 @@ int main(int argc, char **argv)
 
				 
			
 
				 	free_problem(rank);
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	starpu_mpi_shutdown();
			
 
				 #endif
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	MPI_Finalize();
			
 
				 #endif
			
 
				 
			
--- a/examples/stencil/implicit-stencil.h
+++ b/examples/stencil/implicit-stencil.h
@@ -23,7 +23,7 @@
 
				 #include <starpu.h>
			
 
				 
			
 
				 #ifndef __CUDACC__
			
 
				-#ifdef STARPU_USE_MPI
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 #include <mpi.h>
			
 
				 #include <starpu_mpi.h>
			
 
				 #endif
			
--- a/examples/stencil/stencil-kernels.c
+++ b/examples/stencil/stencil-kernels.c
@@ -189,7 +189,7 @@ static void update_func_cuda(void *descr[], void *arg)
 
				 		FPRINTF(stderr,"!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
			
 
				 	else
			
 
				 		DEBUG( "!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int rank = 0;
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	DEBUG( "!!!           RANK %d              !!!\n", rank);
			
@@ -276,7 +276,7 @@ static void update_func_opencl(void *descr[], void *arg)
 
				 		FPRINTF(stderr,"!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
			
 
				 	else
			
 
				 		DEBUG( "!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int rank = 0;
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	DEBUG( "!!!           RANK %d              !!!\n", rank);
			
@@ -346,7 +346,7 @@ void update_func_cpu(void *descr[], void *arg)
 
				 		FPRINTF(stderr,"!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
			
 
				 	else
			
 
				 		DEBUG( "!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int rank = 0;
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	DEBUG( "!!!           RANK %d            !!!\n", rank);
			
--- a/examples/stencil/stencil-tasks.c
+++ b/examples/stencil/stencil-tasks.c
@@ -82,7 +82,7 @@ static void send_done(void *arg)
 
				 	DEBUG("DO SEND %d\n", (int)z);
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_MPI
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 /* Post MPI send */
			
 
				 static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, int local_rank)
			
 
				 {
			
@@ -138,7 +138,7 @@ void create_task_save(unsigned iter, unsigned z, int dir, int local_rank)
 
				 	int node_z = get_block_mpi_node(z);
			
 
				 	int node_z_and_d = get_block_mpi_node(z+dir);
			
 
				 
			
 
				-#ifdef STARPU_USE_MPI
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	if (node_z == local_rank)
			
 
				 	{
			
 
				 		/* Save data from update */
			
--- a/examples/stencil/stencil.c
+++ b/examples/stencil/stencil.c
@@ -188,7 +188,7 @@ void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
 
				 
			
 
				 unsigned global_workerid(unsigned local_workerid)
			
 
				 {
			
 
				-#ifdef STARPU_USE_MPI
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int rank;
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	unsigned workers_per_node = starpu_worker_get_count();
			
@@ -205,7 +205,7 @@ int main(int argc, char **argv)
 
				 	int world_size;
			
 
				 	int ret;
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int thread_support;
			
 
				 	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support))
			
 
				 	{
			
@@ -232,7 +232,7 @@ int main(int argc, char **argv)
 
				 	if (ret == -ENODEV) return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 #endif
			
@@ -246,7 +246,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	create_tasks(rank);
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
 
				 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
 
				 #endif
			
@@ -263,7 +263,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	end = starpu_timing_now();
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
 
				 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
 
				 #endif
			
@@ -274,7 +274,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	/*display_debug(nbz, niter, rank);*/
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	starpu_mpi_shutdown();
			
 
				 #endif
			
 
				 
			
@@ -285,7 +285,7 @@ int main(int argc, char **argv)
 
				 	double max_timing = timing;
			
 
				 	double sum_timing = timing;
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	int reduce_ret;
			
 
				 
			
 
				 	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
			
@@ -366,7 +366,7 @@ int main(int argc, char **argv)
 
				 	free_problem(rank);
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	MPI_Finalize();
			
 
				 #endif
			
 
				 
			
--- a/examples/stencil/stencil.h
+++ b/examples/stencil/stencil.h
@@ -23,7 +23,7 @@
 
				 #include <starpu.h>
			
 
				 
			
 
				 #ifndef __CUDACC__
			
 
				-#ifdef STARPU_USE_MPI
			
 
				+#if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 #include <mpi.h>
			
 
				 #include <starpu_mpi.h>
			
 
				 #endif
			
--- a/include/schedulers/starpu_heteroprio.h
+++ b/include/schedulers/starpu_heteroprio.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2015  INRIA
			
 
				+ * Copyright (C) 2015, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -42,6 +42,7 @@ enum starpu_heteroprio_types
 
				 	STARPU_CUDA_IDX,
			
 
				 	STARPU_OPENCL_IDX,
			
 
				 	STARPU_MIC_IDX,
			
 
				+	STARPU_MPI_MS_IDX,
			
 
				 	STARPU_SCC_IDX,
			
 
				 // This will be the number of archs
			
 
				 	STARPU_NB_TYPES
			
@@ -54,6 +55,7 @@ static const unsigned starpu_heteroprio_types_to_arch[STARPU_NB_TYPES+1] =
 
				 	STARPU_OPENCL,
			
 
				 	STARPU_MIC,
			
 
				 	STARPU_SCC,
			
 
				+        STARPU_MPI_MS,
			
 
				 	0
			
 
				 };
			
 
				 
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2014, 2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010-2015  CNRS
			
 
				- * Copyright (C) 2014  INRIA
			
 
				+ * Copyright (C) 2014, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -90,6 +90,7 @@ struct starpu_conf
 
				 	int nopencl;
			
 
				 	int nmic;
			
 
				 	int nscc;
			
 
				+        int nmpi_ms;
			
 
				 
			
 
				 	unsigned use_explicit_workers_bindid;
			
 
				 	unsigned workers_bindid[STARPU_NMAXWORKERS];
			
@@ -106,6 +107,9 @@ struct starpu_conf
 
				 	unsigned use_explicit_workers_scc_deviceid;
			
 
				 	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
			
 
				 
			
 
				+	unsigned use_explicit_workers_mpi_deviceid;
			
 
				+	unsigned workers_mpi_deviceid[STARPU_NMAXWORKERS];
			
 
				+
			
 
				 	int bus_calibrate;
			
 
				 	int calibrate;
			
 
				 
			
@@ -117,6 +121,7 @@ struct starpu_conf
 
				 	int disable_asynchronous_cuda_copy;
			
 
				 	int disable_asynchronous_opencl_copy;
			
 
				 	int disable_asynchronous_mic_copy;
			
 
				+	int disable_asynchronous_mpi_ms_copy;
			
 
				 
			
 
				 	unsigned *cuda_opengl_interoperability;
			
 
				 	unsigned n_cuda_opengl_interoperability;
			
@@ -146,6 +151,7 @@ int starpu_asynchronous_copy_disabled(void);
 
				 int starpu_asynchronous_cuda_copy_disabled(void);
			
 
				 int starpu_asynchronous_opencl_copy_disabled(void);
			
 
				 int starpu_asynchronous_mic_copy_disabled(void);
			
 
				+int starpu_asynchronous_mpi_ms_copy_disabled(void);
			
 
				 
			
 
				 void starpu_display_stats();
			
 
				 
			
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -35,6 +35,7 @@
 
				 #undef STARPU_USE_OPENCL
			
 
				 #undef STARPU_USE_MIC
			
 
				 #undef STARPU_USE_SCC
			
 
				+#undef STARPU_USE_MPI_MASTER_SLAVE
			
 
				 
			
 
				 #undef STARPU_OPENMP
			
 
				 
			
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -117,7 +117,8 @@ enum starpu_node_kind
 
				 	STARPU_DISK_RAM   = 0x04,
			
 
				 	STARPU_MIC_RAM    = 0x05,
			
 
				 	STARPU_SCC_RAM    = 0x06,
			
 
				-	STARPU_SCC_SHM    = 0x07
			
 
				+	STARPU_SCC_SHM    = 0x07,
			
 
				+	STARPU_MPI_MS_RAM = 0x08
			
 
				 
			
 
				 };
			
 
				 
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010-2014  CNRS
			
 
				- * Copyright (C) 2011-2012  INRIA
			
 
				+ * Copyright (C) 2011-2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -59,6 +59,10 @@ struct starpu_data_copy_methods
 
				 	int (*scc_sink_to_src)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				 	int (*scc_sink_to_sink)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				 
			
 
				+	int (*ram_to_mpi_ms)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				+	int (*mpi_ms_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				+	int (*mpi_ms_to_mpi_ms)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				+
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, starpu_cudaStream_t stream);
			
 
				 	int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, starpu_cudaStream_t stream);
			
@@ -79,6 +83,10 @@ struct starpu_data_copy_methods
 
				 	int (*opencl_to_opencl_async)();
			
 
				 #endif
			
 
				 
			
 
				+	int (*ram_to_mpi_ms_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event);
			
 
				+	int (*mpi_ms_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event);
			
 
				+	int (*mpi_ms_to_mpi_ms_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event);
			
 
				+
			
 
				 	int (*ram_to_mic_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				 	int (*mic_to_ram_async)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
			
 
				 
			
--- a/include/starpu_mpi_ms.h
+++ b/include/starpu_mpi_ms.h
@@ -0,0 +1,40 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2016  INRIA
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_MPI_MS_H__
			
 
				+#define __STARPU_MPI_MS_H__
			
 
				+
			
 
				+#include <starpu_config.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C"
			
 
				+{
			
 
				+#endif
			
 
				+
			
 
				+typedef void *starpu_mpi_ms_func_symbol_t;
			
 
				+
			
 
				+int starpu_mpi_ms_register_kernel(starpu_mpi_ms_func_symbol_t *symbol, const char *func_name);
			
 
				+
			
 
				+starpu_mpi_ms_kernel_t starpu_mpi_ms_get_kernel(starpu_mpi_ms_func_symbol_t symbol);
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				+#endif /* __STARPU_MPI_MS_H__ */
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -3,7 +3,7 @@
 
				  * Copyright (C) 2010-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				- * Copyright (C) 2011, 2014  INRIA
			
 
				+ * Copyright (C) 2011, 2014, 2016  INRIA
			
 
				  * Copyright (C) 2016  Uppsala University
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -43,6 +43,7 @@ extern "C"
 
				 #define STARPU_OPENCL	((1ULL)<<6)
			
 
				 #define STARPU_MIC	((1ULL)<<7)
			
 
				 #define STARPU_SCC	((1ULL)<<8)
			
 
				+#define STARPU_MPI_MS	((1ULL)<<9)
			
 
				 
			
 
				 #define STARPU_CODELET_SIMGRID_EXECUTE	(1<<0)
			
 
				 #define STARPU_CUDA_ASYNC	(1<<0)
			
@@ -75,9 +76,11 @@ typedef void (*starpu_cpu_func_t)(void **, void*);
 
				 typedef void (*starpu_cuda_func_t)(void **, void*);
			
 
				 typedef void (*starpu_opencl_func_t)(void **, void*);
			
 
				 typedef void (*starpu_mic_kernel_t)(void **, void*);
			
 
				+typedef void (*starpu_mpi_ms_kernel_t)(void **, void*);
			
 
				 typedef void (*starpu_scc_kernel_t)(void **, void*);
			
 
				 
			
 
				 typedef starpu_mic_kernel_t (*starpu_mic_func_t)(void);
			
 
				+typedef starpu_mpi_ms_kernel_t (*starpu_mpi_ms_func_t)(void);
			
 
				 typedef starpu_scc_kernel_t (*starpu_scc_func_t)(void);
			
 
				 
			
 
				 #define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    ((starpu_cpu_func_t) -1)
			
@@ -104,6 +107,7 @@ struct starpu_codelet
 
				 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				 	char opencl_flags[STARPU_MAXIMPLEMENTATIONS];
			
 
				 	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				+	starpu_mpi_ms_func_t mpi_ms_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				 	starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS];
			
 
				 
			
 
				 	const char *cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS];
			
--- a/include/starpu_worker.h
+++ b/include/starpu_worker.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2013, 2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010-2014  CNRS
			
 
				+ * Copyright (C) 2016  INRIA
			
 
				  * Copyright (C) 2016  Uppsala University
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -35,6 +36,7 @@ enum starpu_worker_archtype
 
				 	STARPU_OPENCL_WORKER,
			
 
				 	STARPU_MIC_WORKER,
			
 
				 	STARPU_SCC_WORKER,
			
 
				+	STARPU_MPI_WORKER,
			
 
				 	STARPU_ANY_WORKER
			
 
				 };
			
 
				 
			
@@ -89,6 +91,7 @@ unsigned starpu_cuda_worker_get_count(void);
 
				 unsigned starpu_opencl_worker_get_count(void);
			
 
				 unsigned starpu_mic_worker_get_count(void);
			
 
				 unsigned starpu_scc_worker_get_count(void);
			
 
				+unsigned starpu_mpi_ms_worker_get_count(void);
			
 
				 
			
 
				 unsigned starpu_mic_device_get_count(void);
			
 
				 
			
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -136,6 +136,9 @@ noinst_HEADERS = 						\
 
				 	drivers/scc/driver_scc_common.h				\
			
 
				 	drivers/scc/driver_scc_source.h				\
			
 
				 	drivers/scc/driver_scc_sink.h				\
			
 
				+	drivers/mpi/driver_mpi_common.h				\
			
 
				+	drivers/mpi/driver_mpi_source.h				\
			
 
				+	drivers/mpi/driver_mpi_sink.h				\
			
 
				 	drivers/disk/driver_disk.h				\
			
 
				 	debug/traces/starpu_fxt.h				\
			
 
				 	profiling/bound.h					\
			
@@ -368,6 +371,19 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_utils.
 
				 endif
			
 
				 
			
 
				 #########################################
			
 
				+#                                       # 	 
			
 
				+#     MPI Master/Slave compilation      # 	 
			
 
				+#                                       # 	 
			
 
				+######################################### 	 
			
 
				+
			
 
				+if STARPU_USE_MPI_MASTER_SLAVE 	 
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_common.c 	 
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_source.c 	 
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_sink.c 	 
			
 
				+endif 	 
			
 
				+
			
 
				+
			
 
				+#########################################
			
 
				 
			
 
				 showcheck:
			
 
				 	-cat /dev/null
			
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				+ * Copyright (C) 2016  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -40,6 +41,7 @@
 
				 #define _STARPU_FUT_OPENCL_KEY	0x103
			
 
				 #define _STARPU_FUT_MIC_KEY	0x104
			
 
				 #define _STARPU_FUT_SCC_KEY	0x105
			
 
				+#define _STARPU_FUT_MPI_KEY	0x106
			
 
				 
			
 
				 #define _STARPU_FUT_WORKER_INIT_START	0x5100
			
 
				 #define _STARPU_FUT_WORKER_INIT_END	0x5101
			
@@ -52,10 +54,10 @@
 
				 
			
 
				 #define _STARPU_FUT_UPDATE_TASK_CNT	0x5106
			
 
				 
			
 
				-#define _STARPU_FUT_START_FETCH_INPUT	0x5107
			
 
				-#define _STARPU_FUT_END_FETCH_INPUT	0x5108
			
 
				-#define _STARPU_FUT_START_PUSH_OUTPUT	0x5109
			
 
				-#define _STARPU_FUT_END_PUSH_OUTPUT	0x5110
			
 
				+#define _STARPU_FUT_START_FETCH_INPUT_ON_TID	0x5107
			
 
				+#define _STARPU_FUT_END_FETCH_INPUT_ON_TID	0x5108
			
 
				+#define _STARPU_FUT_START_PUSH_OUTPUT_ON_TID	0x5109
			
 
				+#define _STARPU_FUT_END_PUSH_OUTPUT_ON_TID	0x5110
			
 
				 
			
 
				 #define _STARPU_FUT_TAG		0x5111
			
 
				 #define _STARPU_FUT_TAG_DEPS	0x5112
			
@@ -104,8 +106,8 @@
 
				 #define	_STARPU_FUT_START_DRIVER_COPY_ASYNC	0x5135
			
 
				 #define	_STARPU_FUT_END_DRIVER_COPY_ASYNC	0x5136
			
 
				 
			
 
				-#define	_STARPU_FUT_START_PROGRESS	0x5137
			
 
				-#define	_STARPU_FUT_END_PROGRESS		0x5138
			
 
				+#define	_STARPU_FUT_START_PROGRESS_ON_TID	0x5137
			
 
				+#define	_STARPU_FUT_END_PROGRESS_ON_TID		0x5138
			
 
				 
			
 
				 #define _STARPU_FUT_USER_EVENT		0x5139
			
 
				 
			
@@ -151,8 +153,8 @@
 
				 
			
 
				 #define _STARPU_FUT_DATA_LOAD 0x5153
			
 
				 
			
 
				-#define _STARPU_FUT_START_UNPARTITION 0x5154
			
 
				-#define _STARPU_FUT_END_UNPARTITION 0x5155
			
 
				+#define _STARPU_FUT_START_UNPARTITION_ON_TID 0x5154
			
 
				+#define _STARPU_FUT_END_UNPARTITION_ON_TID 0x5155
			
 
				 
			
 
				 #define	_STARPU_FUT_START_FREE		0x5156
			
 
				 #define	_STARPU_FUT_END_FREE		0x5157
			
@@ -209,6 +211,9 @@
 
				 #define _STARPU_FUT_HANDLE_DATA_REGISTER 0x517c
			
 
				 #define _STARPU_FUT_DATA_INVALIDATE 0x517d
			
 
				 
			
 
				+#define _STARPU_FUT_START_FETCH_INPUT	0x517e
			
 
				+#define _STARPU_FUT_END_FETCH_INPUT	0x517f
			
 
				+
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 #include <fxt/fxt.h>
			
 
				 #include <fxt/fut.h>
			
@@ -525,16 +530,22 @@ do {									\
 
				 	FUT_DO_PROBE2(_STARPU_FUT_UPDATE_TASK_CNT, counter, _starpu_gettid())
			
 
				 
			
 
				 #define _STARPU_TRACE_START_FETCH_INPUT(job)	\
			
 
				-	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT, job, _starpu_gettid());
			
 
				+	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT_ON_TID, job, _starpu_gettid());
			
 
				 
			
 
				 #define _STARPU_TRACE_END_FETCH_INPUT(job)	\
			
 
				-	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT, job, _starpu_gettid());
			
 
				+	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT_ON_TID, job, _starpu_gettid());
			
 
				 
			
 
				 #define _STARPU_TRACE_START_PUSH_OUTPUT(job)	\
			
 
				-	FUT_DO_PROBE2(_STARPU_FUT_START_PUSH_OUTPUT, job, _starpu_gettid());
			
 
				+	FUT_DO_PROBE2(_STARPU_FUT_START_PUSH_OUTPUT_ON_TID, job, _starpu_gettid());
			
 
				 
			
 
				 #define _STARPU_TRACE_END_PUSH_OUTPUT(job)	\
			
 
				-	FUT_DO_PROBE2(_STARPU_FUT_END_PUSH_OUTPUT, job, _starpu_gettid());
			
 
				+	FUT_DO_PROBE2(_STARPU_FUT_END_PUSH_OUTPUT_ON_TID, job, _starpu_gettid());
			
 
				+
			
 
				+#define _STARPU_TRACE_WORKER_END_FETCH_INPUT(job, id)	\
			
 
				+	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT, job, id);
			
 
				+
			
 
				+#define _STARPU_TRACE_WORKER_START_FETCH_INPUT(job, id)	\
			
 
				+	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT, job, id);
			
 
				 
			
 
				 #define _STARPU_TRACE_TAG(tag, job)	\
			
 
				 	FUT_DO_PROBE2(_STARPU_FUT_TAG, tag, (job)->job_id)
			
@@ -903,10 +914,10 @@ do {										\
 
				 	FUT_DO_PROBE2(_STARPU_FUT_DATA_LOAD, workerid, size);
			
 
				 
			
 
				 #define _STARPU_TRACE_START_UNPARTITION(handle, memnode)		\
			
 
				-	FUT_DO_PROBE3(_STARPU_FUT_START_UNPARTITION, memnode, _starpu_gettid(), handle);
			
 
				+	FUT_DO_PROBE3(_STARPU_FUT_START_UNPARTITION_ON_TID, memnode, _starpu_gettid(), handle);
			
 
				 	
			
 
				 #define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		\
			
 
				-	FUT_DO_PROBE3(_STARPU_FUT_END_UNPARTITION, memnode, _starpu_gettid(), handle);
			
 
				+	FUT_DO_PROBE3(_STARPU_FUT_END_UNPARTITION_ON_TID, memnode, _starpu_gettid(), handle);
			
 
				 
			
 
				 #define _STARPU_TRACE_SCHED_COMPONENT_PUSH_PRIO(workerid, ntasks, exp_len)		\
			
 
				 	FUT_DO_PROBE4(_STARPU_FUT_SCHED_COMPONENT_PUSH_PRIO, _starpu_gettid(), workerid, ntasks, exp_len);
			
@@ -1046,6 +1057,8 @@ do {										\
 
				 #define _STARPU_TRACE_SCHED_COMPONENT_PULL(from, to, task)	do {} while (0)
			
 
				 #define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)	do {} while (0)
			
 
				 #define _STARPU_TRACE_DATA_INVALIDATE(handle, node)	do {} while (0)
			
 
				+#define _STARPU_TRACE_WORKER_START_FETCH_INPUT(job, id)	do {} while(0)
			
 
				+#define _STARPU_TRACE_WORKER_END_FETCH_INPUT(job, id)	do {} while(0)
			
 
				 
			
 
				 #endif // STARPU_USE_FXT
			
 
				 
			
--- a/src/core/dependencies/data_arbiter_concurrency.c
+++ b/src/core/dependencies/data_arbiter_concurrency.c
@@ -271,7 +271,7 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 
				 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 		{
			
 
				 			cpt++;
			
 
				-			_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
			
 
				+			_starpu_datawizard_progress(0);
			
 
				 		}
			
 
				 		if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 			_starpu_spin_lock(&handle->header_lock);
			
--- a/src/core/dependencies/data_concurrency.c
+++ b/src/core/dependencies/data_concurrency.c
@@ -130,7 +130,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 
				 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 		{
			
 
				 			cpt++;
			
 
				-			_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
			
 
				+			_starpu_datawizard_progress(0);
			
 
				 		}
			
 
				 		if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 			_starpu_spin_lock(&handle->header_lock);
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -123,6 +123,8 @@ double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arc
 
				 			coef = _STARPU_MIC_ALPHA;
			
 
				 		else if (perf_arch->devices[dev].type == STARPU_SCC_WORKER)
			
 
				 			coef = _STARPU_SCC_ALPHA;
			
 
				+		else if (perf_arch->devices[dev].type == STARPU_MPI_WORKER)
			
 
				+			coef = _STARPU_MPI_MS_ALPHA;
			
 
				 
			
 
				 		speedup += coef * (perf_arch->devices[dev].ncores);
			
 
				 	}
			
@@ -263,6 +265,9 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 
				 			case STARPU_SCC_WORKER:
			
 
				 				node_kind = STARPU_SCC_RAM;
			
 
				 				break;
			
 
				+			case STARPU_MPI_WORKER:
			
 
				+				node_kind = STARPU_MPI_MS_RAM;
			
 
				+				break;
			
 
				 			default:
			
 
				 				STARPU_ABORT();
			
 
				 				break;
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -37,6 +37,7 @@
 
				 #include <core/simgrid.h>
			
 
				 #include <core/topology.h>
			
 
				 #include <common/utils.h>
			
 
				+#include <drivers/mpi/driver_mpi_common.h>
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 #include <starpu_opencl.h>
			
@@ -77,6 +78,7 @@ static unsigned ncpus = 0;
 
				 static unsigned ncuda = 0;
			
 
				 static unsigned nopencl = 0;
			
 
				 static unsigned nmic = 0;
			
 
				+static unsigned nmpi_ms = 0;
			
 
				 
			
 
				 /* Benchmarking the performance of the bus */
			
 
				 
			
@@ -121,6 +123,11 @@ static double mic_time_host_to_device[STARPU_MAXNODES] = {0.0};
 
				 static double mic_time_device_to_host[STARPU_MAXNODES] = {0.0};
			
 
				 #endif /* STARPU_USE_MIC */
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+static double mpi_time_device_to_device[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS] = {{0.0}};
			
 
				+static double mpi_latency_device_to_device[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS] = {{0.0}};
			
 
				+#endif
			
 
				+
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 static hwloc_topology_t hwtopology;
			
 
				 #endif
			
@@ -663,7 +670,7 @@ static void benchmark_all_gpu_devices(void)
 
				 	_STARPU_DISP("can not measure bus in simgrid mode, please run starpu_calibrate_bus in non-simgrid mode to make sure the bus performance model was calibrated\n");
			
 
				 	STARPU_ABORT();
			
 
				 #else /* !SIMGRID */
			
 
				-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC)
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	unsigned i;
			
 
				 #endif
			
 
				 #ifdef HAVE_CUDA_MEMCPY_PEER
			
@@ -739,6 +746,12 @@ static void benchmark_all_gpu_devices(void)
 
				 	}
			
 
				 #endif /* STARPU_USE_MIC */
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+    
			
 
				+        _starpu_mpi_common_measure_bandwidth_latency(mpi_time_device_to_device, mpi_latency_device_to_device);
			
 
				+
			
 
				+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				+
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
			
 
				 	hwloc_bitmap_free(former_cpuset);
			
@@ -928,6 +941,12 @@ static void generate_bus_affinity_file(void)
 
				 	if (!was_benchmarked)
			
 
				 		benchmark_all_gpu_devices();
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        /* Slaves don't write files */
			
 
				+        if (!_starpu_mpi_common_is_src_node())
			
 
				+                return;
			
 
				+#endif
			
 
				+
			
 
				 	write_bus_affinity_file_content();
			
 
				 }
			
 
				 
			
@@ -1145,6 +1164,9 @@ static void write_bus_latency_file_content(void)
 
				 #ifdef STARPU_USE_MIC
			
 
				         maxnode += nmic;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        maxnode += nmpi_ms;
			
 
				+#endif
			
 
				         for (src = 0; src < STARPU_MAXNODES; src++)
			
 
				 	{
			
 
				 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
			
@@ -1177,12 +1199,48 @@ static void write_bus_latency_file_content(void)
 
				 				}
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-				if (src > ncuda)
			
 
				+				if (src > ncuda && src <= ncuda + nopencl)
			
 
				 					latency += opencldev_latency_dtoh[src-ncuda];
			
 
				-				if (dst > ncuda)
			
 
				+				if (dst > ncuda && dst <= ncuda + nopencl)
			
 
				 					latency += opencldev_latency_htod[dst-ncuda];
			
 
				 #endif
			
 
				-			}
			
 
				+                                /* TODO Latency MIC */
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+                                /* Map the src and dst node numbers to MPI ranks, skipping over
			
 
				+                                 * the master's rank, because only the slaves count as MPI devices */
			
 
				+                                int mpi_master = _starpu_mpi_common_get_src_node();
			
 
				+
			
 
				+                                int mpi_src = src - (ncuda + nopencl + nmic) - 1;
			
 
				+                                mpi_src = (mpi_master <= mpi_src) ? mpi_src+1 : mpi_src;
			
 
				+
			
 
				+                                int mpi_dst = dst - (ncuda + nopencl + nmic) - 1;
			
 
				+                                mpi_dst = (mpi_master <= mpi_dst) ? mpi_dst+1 : mpi_dst;
			
 
				+
			
 
				+                                if (src > ncuda + nopencl + nmic && src <= ncuda + nopencl + nmic + nmpi_ms)
			
 
				+                                {
			
 
				+                                        if (dst > ncuda + nopencl + nmic && dst <= ncuda + nopencl + nmic + nmpi_ms)
			
 
				+                                        {
			
 
				+                                                /* src and dst identify 2 MPI devices */
			
 
				+                                                latency += mpi_latency_device_to_device[mpi_src][mpi_dst];
			
 
				+                                        }
			
 
				+                                        else
			
 
				+                                        {
			
 
				+                                                /* Only src represents an MPI device 
			
 
				+                                                 * So we add latency between src and master */
			
 
				+                                                latency += mpi_latency_device_to_device[mpi_src][mpi_master];
			
 
				+                                        }
			
 
				+                                }
			
 
				+                                else
			
 
				+                                {
			
 
				+                                        if (dst > ncuda + nopencl + nmic && dst <= ncuda + nopencl + nmic + nmpi_ms)
			
 
				+                                        {
			
 
				+                                                /* Only dst identifies an MPI device 
			
 
				+                                                 * So we add latency between master and dst */
			
 
				+                                                latency += mpi_latency_device_to_device[mpi_master][mpi_dst];
			
 
				+                                        }
			
 
				+                                }
			
 
				+#endif
			
 
				+                        }
			
 
				 
			
 
				 			if (dst)
			
 
				 				fputc('\t', f);
			
@@ -1203,6 +1261,12 @@ static void generate_bus_latency_file(void)
 
				 	if (!was_benchmarked)
			
 
				 		benchmark_all_gpu_devices();
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        /* Slaves don't write files */
			
 
				+        if (!_starpu_mpi_common_is_src_node())
			
 
				+                return;
			
 
				+#endif
			
 
				+
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	write_bus_latency_file_content();
			
 
				 #endif
			
@@ -1366,6 +1430,9 @@ static void write_bus_bandwidth_file_content(void)
 
				 #ifdef STARPU_USE_MIC
			
 
				         maxnode += nmic;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        maxnode += nmpi_ms;
			
 
				+#endif
			
 
				 	for (src = 0; src < STARPU_MAXNODES; src++)
			
 
				 	{
			
 
				 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
			
@@ -1376,7 +1443,7 @@ static void write_bus_bandwidth_file_content(void)
 
				 			{
			
 
				 				bandwidth = NAN;
			
 
				 			}
			
 
				-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC)
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 			else if (src != dst)
			
 
				 			{
			
 
				 				double slowness = 0.0;
			
@@ -1403,11 +1470,47 @@ static void write_bus_bandwidth_file_content(void)
 
				 					slowness += opencldev_timing_htod[dst-ncuda];
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_MIC
			
 
				-				if (src > ncuda + nopencl)
			
 
				+				if (src > ncuda + nopencl && src <= ncuda + nopencl + nmic)
			
 
				 					slowness += mic_time_device_to_host[src - (ncuda + nopencl)];
			
 
				-				if (dst > ncuda + nopencl)
			
 
				+				if (dst > ncuda + nopencl && dst <= ncuda + nopencl + nmic)
			
 
				 					slowness += mic_time_host_to_device[dst - (ncuda + nopencl)];
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+                                /* Map the src and dst node numbers to MPI ranks, skipping over
			
 
				+                                 * the master's rank, because only the slaves count as MPI devices */
			
 
				+                                int mpi_master = _starpu_mpi_common_get_src_node();
			
 
				+
			
 
				+                                int mpi_src = src - (ncuda + nopencl + nmic) - 1;
			
 
				+                                mpi_src = (mpi_master <= mpi_src) ? mpi_src+1 : mpi_src;
			
 
				+
			
 
				+                                int mpi_dst = dst - (ncuda + nopencl + nmic) - 1;
			
 
				+                                mpi_dst = (mpi_master <= mpi_dst) ? mpi_dst+1 : mpi_dst;
			
 
				+
			
 
				+                                /* here we have bandwidth */
			
 
				+                                if (src > ncuda + nopencl + nmic && src <= ncuda + nopencl + nmic + nmpi_ms)
			
 
				+                                {
			
 
				+                                        if (dst > ncuda + nopencl + nmic && dst <= ncuda + nopencl + nmic + nmpi_ms)
			
 
				+                                        {
			
 
				+                                                /* src and dst identify 2 MPI devices */
			
 
				+                                                slowness += 1.0/mpi_time_device_to_device[mpi_src][mpi_dst];
			
 
				+                                        }
			
 
				+                                        else
			
 
				+                                        {
			
 
				+                                                /* Only src represents an MPI device 
			
 
				+                                                 * So we add bandwidth between src and master */
			
 
				+                                                slowness += 1.0/mpi_time_device_to_device[mpi_src][mpi_master];
			
 
				+                                        }
			
 
				+                                }
			
 
				+                                else
			
 
				+                                {
			
 
				+                                        if (dst > ncuda + nopencl + nmic && dst <= ncuda + nopencl + nmic + nmpi_ms)
			
 
				+                                        {
			
 
				+                                                /* Only dst identifies an MPI device 
			
 
				+                                                 * So we add bandwidth between master and dst */
			
 
				+                                                slowness += 1.0/mpi_time_device_to_device[mpi_master][mpi_dst];
			
 
				+                                        }
			
 
				+                                }
			
 
				+#endif
			
 
				 				bandwidth = 1.0/slowness;
			
 
				 			}
			
 
				 #endif
			
@@ -1457,6 +1560,9 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				 #ifdef STARPU_USE_MIC
			
 
				         maxnode += nmic;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        maxnode += nmpi_ms;
			
 
				+#endif
			
 
				 
			
 
				 	fprintf(f, "from/to\t");
			
 
				 	fprintf(f, "RAM\t");
			
@@ -1466,6 +1572,8 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				 		fprintf(f, "OpenCL%u\t", dst);
			
 
				 	for (dst = 0; dst < nmic; dst++)
			
 
				 		fprintf(f, "MIC%u\t", dst);
			
 
				+	for (dst = 0; dst < nmpi_ms; dst++)
			
 
				+		fprintf(f, "MPI_MS%u\t", dst); /* dst is unsigned, like the MIC/OpenCL columns */
			
 
				 	fprintf(f, "\n");
			
 
				 
			
 
				 	for (src = 0; src <= maxnode; src++)
			
@@ -1476,8 +1584,10 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				 			fprintf(f, "CUDA %u\t", src-1);
			
 
				 		else if (src <= ncuda + nopencl)
			
 
				 			fprintf(f, "OpenCL%u\t", src-ncuda-1);
			
 
				-		else
			
 
				+		else if (src <= ncuda + nopencl + nmic)
			
 
				 			fprintf(f, "MIC%u\t", src-ncuda-nopencl-1);
			
 
				+		else /* past the MIC range: src is an MPI master/slave node */
			
 
				+			fprintf(f, "MPI_MS%u\t", src-ncuda-nopencl-nmic-1);
			
 
				 		for (dst = 0; dst <= maxnode; dst++)
			
 
				 			fprintf(f, "%.0f\t", bandwidth_matrix[src][dst]);
			
 
				 
			
@@ -1493,8 +1603,10 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				 			fprintf(f, "CUDA %u\t", src-1);
			
 
				 		else if (src <= ncuda + nopencl)
			
 
				 			fprintf(f, "OpenCL%u\t", src-ncuda-1);
			
 
				-		else
			
 
				+		else if (src <= ncuda + nopencl + nmic)
			
 
				 			fprintf(f, "MIC%u\t", src-ncuda-nopencl-1);
			
 
				+		else /* past the MIC range: src is an MPI master/slave node */
			
 
				+			fprintf(f, "MPI_MS%u\t", src-ncuda-nopencl-nmic-1);
			
 
				 		for (dst = 0; dst <= maxnode; dst++)
			
 
				 			fprintf(f, "%.0f\t", latency_matrix[src][dst]);
			
 
				 
			
@@ -1550,6 +1662,12 @@ static void generate_bus_bandwidth_file(void)
 
				 {
			
 
				 	if (!was_benchmarked)
			
 
				 		benchmark_all_gpu_devices();
			
 
				+    
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        /* Slaves don't write files */
			
 
				+        if (!_starpu_mpi_common_is_src_node())
			
 
				+                return;
			
 
				+#endif
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	write_bus_bandwidth_file_content();
			
@@ -1580,56 +1698,125 @@ static void get_config_path(char *path, size_t maxlen)
 
				 	get_bus_path("config", path, maxlen);
			
 
				 }
			
 
				 
			
 
				+#if defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				+/* Check if the master or one slave has to recalibrate: all the flags are
			
 
				+ * gathered on MPI_COMM_WORLD and 1 is returned as soon as one is non-zero.
			
 
				+ * NOTE(review): assumes MPI_COMM_WORLD holds exactly device count + 1 ranks. */
			
 
				+static int mpi_check_recalibrate(int my_recalibrate)
			
 
				+{
			
 
				+        int nb_mpi = _starpu_mpi_src_get_device_count() + 1;
			
 
				+        int mpi_recalibrate[nb_mpi];
			
 
				+        int i;
			
 
				+
			
 
				+        MPI_Allgather(&my_recalibrate, 1, MPI_INT, mpi_recalibrate, 1, MPI_INT, MPI_COMM_WORLD);
			
 
				+
			
 
				+        for (i = 0; i < nb_mpi; i++)
			
 
				+        {
			
 
				+                if (mpi_recalibrate[i])
			
 
				+                        return 1;
			
 
				+        }
			
 
				+        return 0;
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+/* Recalibrate the bus when the value stored in the config file (val_file)
			
 
				+ * differs from the detected one (val_detected); msg names the resource. */
			
 
				+static void compare_value_and_recalibrate(const char *msg, unsigned val_file, unsigned val_detected)
			
 
				+{
			
 
				+        int recalibrate = (val_file != val_detected);
			
 
				+
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        /* Share the flag with the other MPI processes: everybody recalibrates if at least one of them read a mismatching value */
			
 
				+        recalibrate = mpi_check_recalibrate(recalibrate);
			
 
				+#endif
			
 
				+
			
 
				+        if (recalibrate)
			
 
				+        {
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+                /* Only the master prints the message */
			
 
				+                if (_starpu_mpi_common_is_src_node())
			
 
				+#endif
			
 
				+                        _STARPU_DISP("Current configuration does not match the bus performance model (%s: (stored) %u != (current) %u), recalibrating...\n", msg, val_file, val_detected);
			
 
				+
			
 
				+                _starpu_bus_force_sampling();
			
 
				+
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+                if (_starpu_mpi_common_is_src_node())
			
 
				+#endif
			
 
				+                        _STARPU_DISP("... done\n");
			
 
				+        }
			
 
				+}
			
 
				+
			
 
				 static void check_bus_config_file(void)
			
 
				 {
			
 
				         int res;
			
 
				         char path[256];
			
 
				         struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+	int recalibrate = 0;
			
 
				 
			
 
				         get_config_path(path, sizeof(path));
			
 
				         res = access(path, F_OK);
			
 
				+
			
 
				 	if (res || config->conf.bus_calibrate > 0)
			
 
				+		recalibrate = 1;
			
 
				+
			
 
				+#if defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				+	//Exchange the flags between all processes: everybody recalibrates if at least one of them has to (e.g. its config file is missing)
			
 
				+	recalibrate = mpi_check_recalibrate(recalibrate);
			
 
				+#endif
			
 
				+
			
 
				+	if (recalibrate)
			
 
				 	{
			
 
				 		if (res)
			
 
				 			_STARPU_DISP("No performance model for the bus, calibrating...\n");
			
 
				 		_starpu_bus_force_sampling();
			
 
				 		if (res)
			
 
				 			_STARPU_DISP("... done\n");
			
 
				-        }
			
 
				-        else
			
 
				+	}
			
 
				+	else
			
 
				 	{
			
 
				                 FILE *f;
			
 
				                 int ret;
			
 
				-		unsigned read_cuda = -1, read_opencl = -1, read_mic = -1;
			
 
				+                unsigned read_cuda = -1, read_opencl = -1, read_mic = -1, read_mpi_ms = -1;
			
 
				                 unsigned read_cpus = -1;
			
 
				-		int locked;
			
 
				+                int locked;
			
 
				 
			
 
				                 // Loading configuration from file
			
 
				                 f = fopen(path, "r");
			
 
				                 STARPU_ASSERT(f);
			
 
				-		locked = _starpu_frdlock(f) == 0;
			
 
				+                locked = _starpu_frdlock(f) == 0;
			
 
				                 _starpu_drop_comments(f);
			
 
				+
			
 
				                 ret = fscanf(f, "%u\t", &read_cpus);
			
 
				-		STARPU_ASSERT(ret == 1);
			
 
				+                STARPU_ASSERT(ret == 1);
			
 
				                 _starpu_drop_comments(f);
			
 
				-		ret = fscanf(f, "%u\t", &read_cuda);
			
 
				-		STARPU_ASSERT(ret == 1);
			
 
				+
			
 
				+                ret = fscanf(f, "%u\t", &read_cuda);
			
 
				+                STARPU_ASSERT(ret == 1);
			
 
				                 _starpu_drop_comments(f);
			
 
				-		ret = fscanf(f, "%u\t", &read_opencl);
			
 
				-		STARPU_ASSERT(ret == 1);
			
 
				+
			
 
				+                ret = fscanf(f, "%u\t", &read_opencl);
			
 
				+                STARPU_ASSERT(ret == 1);
			
 
				                 _starpu_drop_comments(f);
			
 
				-		ret = fscanf(f, "%u\t", &read_mic);
			
 
				-		if (ret == 0)
			
 
				-			read_mic = 0;
			
 
				+
			
 
				+                ret = fscanf(f, "%u\t", &read_mic);
			
 
				+                if (ret == 0)
			
 
				+                        read_mic = 0;
			
 
				+                _starpu_drop_comments(f);
			
 
				+
			
 
				+                ret = fscanf(f, "%u\t", &read_mpi_ms);
			
 
				+                if (ret == 0)
			
 
				+                        read_mpi_ms = 0;
			
 
				                 _starpu_drop_comments(f);
			
 
				-		if (locked)
			
 
				-			_starpu_frdunlock(f);
			
 
				+
			
 
				+                if (locked)
			
 
				+                        _starpu_frdunlock(f);
			
 
				                 fclose(f);
			
 
				 
			
 
				                 // Loading current configuration
			
 
				                 ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-		ncuda = _starpu_get_cuda_device_count();
			
 
				+                ncuda = _starpu_get_cuda_device_count();
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				                 nopencl = _starpu_opencl_get_device_count();
			
@@ -1637,32 +1824,16 @@ static void check_bus_config_file(void)
 
				 #ifdef STARPU_USE_MIC
			
 
				                 nmic = _starpu_mic_src_get_device_count();
			
 
				 #endif /* STARPU_USE_MIC */
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+                nmpi_ms = _starpu_mpi_src_get_device_count();
			
 
				+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				 
			
 
				                 // Checking if both configurations match
			
 
				-                if (read_cpus != ncpus)
			
 
				-		{
			
 
				-			_STARPU_DISP("Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...\n", read_cpus, ncpus);
			
 
				-                        _starpu_bus_force_sampling();
			
 
				-			_STARPU_DISP("... done\n");
			
 
				-                }
			
 
				-                else if (read_cuda != ncuda)
			
 
				-		{
			
 
				-                        _STARPU_DISP("Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...\n", read_cuda, ncuda);
			
 
				-                        _starpu_bus_force_sampling();
			
 
				-			_STARPU_DISP("... done\n");
			
 
				-                }
			
 
				-                else if (read_opencl != nopencl)
			
 
				-		{
			
 
				-                        _STARPU_DISP("Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...\n", read_opencl, nopencl);
			
 
				-                        _starpu_bus_force_sampling();
			
 
				-			_STARPU_DISP("... done\n");
			
 
				-                }
			
 
				-                else if (read_mic != nmic)
			
 
				-		{
			
 
				-                        _STARPU_DISP("Current configuration does not match the bus performance model (MIC: (stored) %d != (current) %d), recalibrating...\n", read_mic, nmic);
			
 
				-                        _starpu_bus_force_sampling();
			
 
				-			_STARPU_DISP("... done\n");
			
 
				-                }
			
 
				+                compare_value_and_recalibrate("CPUS", read_cpus, ncpus);
			
 
				+                compare_value_and_recalibrate("CUDA", read_cuda, ncuda);
			
 
				+                compare_value_and_recalibrate("OpenCL", read_opencl, nopencl);
			
 
				+                compare_value_and_recalibrate("MIC", read_mic, nmic);
			
 
				+                compare_value_and_recalibrate("MPI Master-Slave", read_mpi_ms, nmpi_ms);
			
 
				         }
			
 
				 }
			
 
				 
			
@@ -1687,6 +1858,7 @@ static void write_bus_config_file_content(void)
 
				         fprintf(f, "%u # Number of CUDA devices\n", ncuda);
			
 
				         fprintf(f, "%u # Number of OpenCL devices\n", nopencl);
			
 
				         fprintf(f, "%u # Number of MIC devices\n", nmic);
			
 
				+        fprintf(f, "%d # Number of MPI devices\n", nmpi_ms);
			
 
				 
			
 
				 	if (locked)
			
 
				 		_starpu_fwrunlock(f);
			
@@ -1697,6 +1869,12 @@ static void generate_bus_config_file(void)
 
				 {
			
 
				 	if (!was_benchmarked)
			
 
				 		benchmark_all_gpu_devices();
			
 
				+    
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        /* Slaves don't write files */
			
 
				+        if (!_starpu_mpi_common_is_src_node())
			
 
				+                return;
			
 
				+#endif
			
 
				 
			
 
				 	write_bus_config_file_content();
			
 
				 }
			
@@ -2427,6 +2605,12 @@ static void generate_bus_platform_file(void)
 
				 	if (!was_benchmarked)
			
 
				 		benchmark_all_gpu_devices();
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        /* Slaves don't write files */
			
 
				+        if (!_starpu_mpi_common_is_src_node())
			
 
				+                return;
			
 
				+#endif
			
 
				+
			
 
				 	write_bus_platform_file_content(3);
			
 
				 	write_bus_platform_file_content(4);
			
 
				 }
			
@@ -2480,12 +2664,23 @@ void _starpu_load_bus_performance_files(void)
 
				 #if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SIMGRID)
			
 
				 	nopencl = _starpu_opencl_get_device_count();
			
 
				 #endif
			
 
				+#if defined(STARPU_USE_MPI_MASTER_SLAVE) || defined(STARPU_USE_SIMGRID)
			
 
				+        nmpi_ms = _starpu_mpi_src_get_device_count();
			
 
				+#endif
			
 
				 #if defined(STARPU_USE_MIC) || defined(STARPU_USE_SIMGRID)
			
 
				 	nmic = _starpu_mic_src_get_device_count();
			
 
				 #endif
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				         check_bus_config_file();
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        /* be sure that master wrote the perf files */
			
 
				+        _starpu_mpi_common_barrier();
			
 
				+#endif
			
 
				+
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	load_bus_affinity_file();
			
 
				 #endif
			
 
				 	load_bus_latency_file();
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -568,6 +568,8 @@ static enum starpu_worker_archtype _get_enum_type(int type)
 
				 			return STARPU_MIC_WORKER;
			
 
				         	case 4:
			
 
				 			return STARPU_SCC_WORKER;
			
 
				+        	case 5:
			
 
				+			return STARPU_MPI_WORKER;
			
 
				 		default:
			
 
				 			STARPU_ABORT();
			
 
				 	}
			
@@ -715,7 +717,7 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 
				 		{
			
 
				 			fprintf(f, "####################\n");
			
 
				 			fprintf(f, "# DEV_%d\n", dev);
			
 
				-			fprintf(f, "# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3, SCC - 4)\n");
			
 
				+			fprintf(f, "# device type (CPU - 0, CUDA - 1, OPENCL - 2, MIC - 3, SCC - 4, MPI_MS - 5)\n");
			
 
				 			fprintf(f, "%u\n", arch_combs[comb]->devices[dev].type);
			
 
				 
			
 
				 			fprintf(f, "####################\n");
			
@@ -904,11 +906,14 @@ void _starpu_initialize_registered_performance_models(void)
 
				 	unsigned i;
			
 
				 	for(i = 0; i < conf->topology.nhwmicdevices; i++)
			
 
				 		nmic += conf->topology.nhwmiccores[i];
			
 
				+	unsigned nmpi = 0;
			
 
				+	for(i = 0; i < conf->topology.nhwmpidevices; i++)
			
 
				+		nmpi += conf->topology.nhwmpicores[i];
			
 
				 	unsigned nscc = conf->topology.nhwscc;
			
 
				 
			
 
				-	// We used to allocate 2**(ncores + ncuda + nopencl + nmic + nscc), this is too big
			
 
				-	// We now allocate only 2*(ncores + ncuda + nopencl + nmic + nscc), and reallocate when necessary in starpu_perfmodel_arch_comb_add
			
 
				-	nb_arch_combs = 2 * (ncores + ncuda + nopencl + nmic + nscc);
			
 
				+	// We used to allocate 2**(ncores + ncuda + nopencl + nmic + nscc + nmpi), this is too big
			
 
				+	// We now allocate only 2*(ncores + ncuda + nopencl + nmic + nscc + nmpi), and reallocate when necessary in starpu_perfmodel_arch_comb_add
			
 
				+	nb_arch_combs = 2 * (ncores + ncuda + nopencl + nmic + nscc + nmpi);
			
 
				 	_STARPU_MALLOC(arch_combs, nb_arch_combs*sizeof(struct starpu_perfmodel_arch*));
			
 
				 	current_arch_comb = 0;
			
 
				 	STARPU_PTHREAD_RWLOCK_INIT(&arch_combs_mutex, NULL);
			
@@ -918,6 +923,7 @@ void _starpu_initialize_registered_performance_models(void)
 
				 	ignore_devid[STARPU_CUDA_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_CUDA", 0);
			
 
				 	ignore_devid[STARPU_OPENCL_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL", 0);
			
 
				 	ignore_devid[STARPU_MIC_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_MIC", 0);
			
 
				+	ignore_devid[STARPU_MPI_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS", 0);
			
 
				 	ignore_devid[STARPU_SCC_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_SCC", 0);
			
 
				 }
			
 
				 
			
@@ -1200,6 +1206,9 @@ char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
 
				 		case(STARPU_SCC_WORKER):
			
 
				 			return "scc";
			
 
				 			break;
			
 
				+		case(STARPU_MPI_WORKER):
			
 
				+			return "mpi_ms";
			
 
				+			break;
			
 
				 		default:
			
 
				 			STARPU_ABORT();
			
 
				 			break;
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -485,6 +485,18 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 
				 
			
 
				 	some_impl = 0;
			
 
				 	for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
			
 
				+		if (cl->mpi_ms_funcs[i])
			
 
				+		{
			
 
				+			some_impl = 1;
			
 
				+			break;
			
 
				+		}
			
 
				+	if (some_impl && is_where_unset)
			
 
				+	{
			
 
				+		cl->where |= STARPU_MPI_MS;
			
 
				+	}
			
 
				+
			
 
				+	some_impl = 0;
			
 
				+	for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
			
 
				 		if (cl->scc_funcs[i])
			
 
				 		{
			
 
				 			some_impl = 1;
			
@@ -504,7 +516,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 
				 		}
			
 
				 	if (some_impl && is_where_unset)
			
 
				 	{
			
 
				-		cl->where |= STARPU_MIC|STARPU_SCC;
			
 
				+		cl->where |= STARPU_MIC|STARPU_SCC|STARPU_MPI_MS;
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -1146,6 +1158,7 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 
				 				case STARPU_CUDA_RAM:      /* Fall through */
			
 
				 				case STARPU_OPENCL_RAM:
			
 
				 				case STARPU_MIC_RAM:
			
 
				+                                case STARPU_MPI_MS_RAM:
			
 
				 				case STARPU_SCC_RAM:
			
 
				 					return 1;
			
 
				 				default:
			
@@ -1163,6 +1176,7 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 
				 				case STARPU_CUDA_RAM:
			
 
				 				case STARPU_OPENCL_RAM:
			
 
				 				case STARPU_MIC_RAM:
			
 
				+                                case STARPU_MPI_MS_RAM:
			
 
				 				case STARPU_SCC_RAM:
			
 
				 					return 0;
			
 
				 				default:
			
--- a/src/core/task.h
+++ b/src/core/task.h
@@ -111,6 +111,11 @@ static inline starpu_mic_func_t _starpu_task_get_mic_nth_implementation(struct s
 
				 	return cl->mic_funcs[nimpl];
			
 
				 }
			
 
				 
			
 
				+static inline starpu_mpi_ms_func_t _starpu_task_get_mpi_ms_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
			
 
				+{ /* Return entry nimpl of the codelet's mpi_ms_funcs table. */
			
 
				+	return cl->mpi_ms_funcs[nimpl]; /* nimpl is used as-is, no range check (same as the MIC/SCC accessors) */
			
 
				+}
			
 
				+
			
 
				 static inline starpu_scc_func_t _starpu_task_get_scc_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
			
 
				 {
			
 
				 	return cl->scc_funcs[nimpl];
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -26,6 +26,8 @@
 
				 #include <drivers/cuda/driver_cuda.h>
			
 
				 #include <drivers/mic/driver_mic_source.h>
			
 
				 #include <drivers/scc/driver_scc_source.h>
			
 
				+#include <drivers/mpi/driver_mpi_source.h>
			
 
				+#include <drivers/mpi/driver_mpi_common.h>
			
 
				 #include <drivers/mp_common/source_common.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 #include <profiling/profiling.h>
			
@@ -58,7 +60,7 @@ static int nobind;
 
				 /* For checking whether two workers share the same PU, indexed by PU number */
			
 
				 static int cpu_worker[STARPU_MAXCPUS];
			
 
				 
			
 
				-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 
			
 
				 struct handle_entry
			
 
				 {
			
@@ -81,6 +83,9 @@ static struct _starpu_worker_set cuda_worker_set[STARPU_MAXCUDADEVS];
 
				 #ifdef STARPU_USE_MIC
			
 
				 static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
			
 
				+#endif
			
 
				 
			
 
				 void *
			
 
				 _starpu_get_worker_from_driver(struct starpu_driver *d)
			
@@ -138,7 +143,7 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 
				  * Discover the topology of the machine
			
 
				  */
			
 
				 
			
 
				-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC)  || defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC)  || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 static void
			
 
				 _starpu_initialize_workers_deviceid (int *explicit_workers_gpuid,
			
 
				 				  int *current, int *workers_gpuid,
			
@@ -395,6 +400,31 @@ static inline int _starpu_get_next_scc_deviceid(struct _starpu_machine_config *c
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+static inline int _starpu_get_next_mpi_deviceid(struct _starpu_machine_config *config)
			
 
				+{ /* Return the next entry of workers_mpi_deviceid, cycling through the first nmpidevices slots. */
			
 
				+	unsigned i = ((config->current_mpi_deviceid++) % config->topology.nmpidevices); /* post-increment the cursor, wrap the old value */
			
 
				+	/* NOTE(review): nmpidevices == 0 would make the modulo above undefined -- confirm callers exclude it. */
			
 
				+	return (int)config->topology.workers_mpi_deviceid[i];
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+_starpu_init_mpi_topology (struct _starpu_machine_config *config, long mpi_idx)
			
 
				+{
			
 
				+	/* Discover the topology of the MPI node identified by MPI_IDX. That
			
 
				+	 * is, make this StarPU instance aware of the number of cores available
			
 
				+	 * on this MPI device, and update the `nhwmpicores' topology field
			
 
				+	 * accordingly. */
			
 
				+
			
 
				+	struct _starpu_machine_topology *topology = &config->topology;
			
 
				+
			
 
				+	int nbcores; /* NOTE(review): left uninitialized; presumably always written by the call below -- confirm */
			
 
				+	_starpu_src_common_sink_nbcores (mpi_ms_nodes[mpi_idx], &nbcores);
			
 
				+	topology->nhwmpicores[mpi_idx] = nbcores;
			
 
				+}
			
 
				+
			
 
				+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				+
			
 
				 #ifdef STARPU_USE_MIC
			
 
				 static void
			
 
				 _starpu_init_mic_topology (struct _starpu_machine_config *config, long mic_idx)
			
@@ -583,6 +613,9 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 
				 #ifdef STARPU_USE_SCC
			
 
				 	config->topology.nhwscc = _starpu_scc_src_get_device_count();
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE 
			
 
				+        config->topology.nhwmpi = _starpu_mpi_src_get_device_count();
			
 
				+#endif
			
 
				 
			
 
				 	topology_is_initialized = 1;
			
 
				 }
			
@@ -870,16 +903,75 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 
				 	}
			
 
				 
			
 
				 	topology->nworkers += topology->nmiccores[mic_idx];
			
 
				-    }
			
 
				+}  
			
 
				 
			
 
				-#ifdef STARPU_USE_MIC
			
 
				 static COIENGINE mic_handles[STARPU_MAXMICDEVS];
			
 
				 COIPROCESS _starpu_mic_process[STARPU_MAXMICDEVS];
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+static void
			
 
				+_starpu_init_mpi_config (struct _starpu_machine_config *config,
			
 
				+                struct starpu_conf *user_conf,
			
 
				+                unsigned mpi_idx)
			
 
				+{
			
 
				+        struct _starpu_machine_topology *topology = &config->topology;
			
 
				+        /* Set up one worker per usable core of MPI master-slave device mpi_idx (user_conf is unused here). */
			
 
				+        topology->nhwmpicores[mpi_idx] = 0;
			
 
				+        /* Probe the device; this refills nhwmpicores[mpi_idx]. */
			
 
				+        _starpu_init_mpi_topology (config, mpi_idx);
			
 
				+        /* STARPU_NMPIMSTHREADS, when set, caps the number of workers created for each MPI device. */
			
 
				+        int nmpicores;
			
 
				+        nmpicores = starpu_get_env_number("STARPU_NMPIMSTHREADS");
			
 
				+
			
 
				+        if (nmpicores == -1)
			
 
				+        {
			
 
				+                /* Nothing was specified, so let's use the number of
			
 
				+                 * detected MPI cores. */
			
 
				+                nmpicores = topology->nhwmpicores[mpi_idx];
			
 
				+        }
			
 
				+        else
			
 
				+        {
			
 
				+                if ((unsigned) nmpicores > topology->nhwmpicores[mpi_idx])
			
 
				+                {
			
 
				+                        /* The user requests more MPI cores than are available: warn and clamp */
			
 
				+                        fprintf(stderr,
			
 
				+                                        "# Warning: %d MPI cores requested. Only %d available.\n",
			
 
				+                                        nmpicores, topology->nhwmpicores[mpi_idx]);
			
 
				+                        nmpicores = topology->nhwmpicores[mpi_idx];
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+        topology->nmpicores[mpi_idx] = nmpicores;
			
 
				+        STARPU_ASSERT_MSG(topology->nmpicores[mpi_idx] + topology->nworkers <= STARPU_NMAXWORKERS,
			
 
				+                        "topology->nmpicores[mpi_idx(%u)] (%d) + topology->nworkers (%d) <= STARPU_NMAXWORKERS (%d)",
			
 
				+                        mpi_idx, topology->nmpicores[mpi_idx], topology->nworkers, STARPU_NMAXWORKERS);
			
 
				+        /* This device's workers are laid out contiguously after the already-registered ones. */
			
 
				+        mpi_worker_set[mpi_idx].workers = &config->workers[topology->nworkers];
			
 
				+        unsigned mpicore_id;
			
 
				+        for (mpicore_id = 0; mpicore_id < topology->nmpicores[mpi_idx]; mpicore_id++)
			
 
				+        {
			
 
				+                int worker_idx = topology->nworkers + mpicore_id;
			
 
				+                config->workers[worker_idx].set = &mpi_worker_set[mpi_idx];
			
 
				+                config->workers[worker_idx].arch = STARPU_MPI_WORKER;
			
 
				+                _STARPU_MALLOC(config->workers[worker_idx].perf_arch.devices, sizeof(struct starpu_perfmodel_device));
			
 
				+                config->workers[worker_idx].perf_arch.ndevices = 1;
			
 
				+                config->workers[worker_idx].perf_arch.devices[0].type = STARPU_MPI_WORKER;
			
 
				+                config->workers[worker_idx].perf_arch.devices[0].devid = mpi_idx;
			
 
				+                config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
			
 
				+                config->workers[worker_idx].devid = mpi_idx;
			
 
				+                config->workers[worker_idx].subworkerid = mpicore_id;
			
 
				+                config->workers[worker_idx].worker_mask = STARPU_MPI_MS;
			
 
				+                config->worker_mask |= STARPU_MPI_MS;
			
 
				+        }
			
 
				+        /* Account for the workers just configured. */
			
 
				+        topology->nworkers += topology->nmpicores[mpi_idx];
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				 static void
			
 
				 _starpu_init_mp_config (struct _starpu_machine_config *config,
			
 
				-			struct starpu_conf *user_conf)
			
 
				+			struct starpu_conf *user_conf, int no_mp_config)
			
 
				 {
			
 
				 	/* Discover and configure the mp topology. That means:
			
 
				 	 * - discover the number of mp nodes;
			
@@ -890,20 +982,20 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 
				 
			
 
				 	struct _starpu_machine_topology *topology = &config->topology;
			
 
				 
			
 
				-	// We currently only support MIC at this level.
			
 
				 #ifdef STARPU_USE_MIC
			
 
				-
			
 
				-	/* Discover and initialize the number of MIC nodes through the mp
			
 
				-	 * infrastructure. */
			
 
				-	unsigned nhwmicdevices = _starpu_mic_src_get_device_count();
			
 
				-
			
 
				-	int reqmicdevices = starpu_get_env_number("STARPU_NMIC");
			
 
				-	if (reqmicdevices == -1 && user_conf)
			
 
				-		reqmicdevices = user_conf->nmic;
			
 
				-	if (reqmicdevices == -1)
			
 
				-		/* Nothing was specified, so let's use the number of
			
 
				-		 * detected mic devices. ! */
			
 
				-		reqmicdevices = nhwmicdevices;
			
 
				+    if (!no_mp_config)
			
 
				+    {
			
 
				+        /* Discover and initialize the number of MIC nodes through the mp
			
 
				+         * infrastructure. */
			
 
				+        unsigned nhwmicdevices = _starpu_mic_src_get_device_count();
			
 
				+
			
 
				+        int reqmicdevices = starpu_get_env_number("STARPU_NMIC");
			
 
				+        if (reqmicdevices == -1 && user_conf)
			
 
				+            reqmicdevices = user_conf->nmic;
			
 
				+        if (reqmicdevices == -1)
			
 
				+            /* Nothing was specified, so let's use the number of
			
 
				+             * detected mic devices. ! */
			
 
				+            reqmicdevices = nhwmicdevices;
			
 
				 
			
 
				 	if (reqmicdevices != -1)
			
 
				 	{
			
@@ -915,18 +1007,67 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	topology->nmicdevices = 0;
			
 
				-	unsigned i;
			
 
				-	for (i = 0; i < (unsigned) reqmicdevices; i++)
			
 
				-		if (0 == _starpu_init_mic_node (config, i, &mic_handles[i], &_starpu_mic_process[i]))
			
 
				-			topology->nmicdevices++;
			
 
				+        topology->nmicdevices = 0;
			
 
				+        unsigned i;
			
 
				+        for (i = 0; i < (unsigned) reqmicdevices; i++)
			
 
				+                if (0 == _starpu_init_mic_node (config, i, &mic_handles[i], &_starpu_mic_process[i]))
			
 
				+                        topology->nmicdevices++;
			
 
				 
			
 
				 
			
 
				-	for (i = 0; i < topology->nmicdevices; i++)
			
 
				-		_starpu_init_mic_config (config, user_conf, i);
			
 
				+        for (i = 0; i < topology->nmicdevices; i++)
			
 
				+                _starpu_init_mic_config (config, user_conf, i);
			
 
				+    }
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+    {
			
 
				+            /* Discover and initialize the number of MPI nodes through the mp
			
 
				+             * infrastructure. */
			
 
				+            unsigned nhwmpidevices = _starpu_mpi_src_get_device_count();
			
 
				+
			
 
				+            int reqmpidevices = starpu_get_env_number("STARPU_NMPI_MS");
			
 
				+            if (reqmpidevices == -1 && user_conf)
			
 
				+                    reqmpidevices = user_conf->nmpi_ms;
			
 
				+            if (reqmpidevices == -1)
			
 
				+                    /* Nothing was specified, so let's use the number of
			
 
				+                     * detected mpi devices. ! */
			
 
				+                    reqmpidevices = nhwmpidevices;
			
 
				+
			
 
				+            if (reqmpidevices != -1)
			
 
				+            {
			
 
				+                    if ((unsigned) reqmpidevices > nhwmpidevices)
			
 
				+                    {
			
 
				+                            /* The user requires more MPI devices than there is available */
			
 
				+                            fprintf(stderr,
			
 
				+                                            "# Warning: %d MPI Master-Slave devices requested. Only %d available.\n",
			
 
				+                                            reqmpidevices, nhwmpidevices);
			
 
				+                            reqmpidevices = nhwmpidevices;
			
 
				+                    }
			
 
				+            }
			
 
				+
			
 
				+            topology->nmpidevices = reqmpidevices;
			
 
				+
			
 
				+            /* if user don't want to use MPI slaves, we close the slave processes */
			
 
				+            if (no_mp_config && topology->nmpidevices == 0)
			
 
				+            {
			
 
				+                    _starpu_mpi_common_mp_deinit();
			
 
				+                    exit(0);
			
 
				+            }
			
 
				+
			
 
				+            if (!no_mp_config)
			
 
				+            {
			
 
				+                    unsigned i;
			
 
				+                    for (i = 0; i < topology->nmpidevices; i++)
			
 
				+                            mpi_ms_nodes[i] = _starpu_mp_common_node_create(STARPU_NODE_MPI_SOURCE, i);
			
 
				+
			
 
				+
			
 
				+                    for (i = 0; i < topology->nmpidevices; i++)
			
 
				+                            _starpu_init_mpi_config (config, user_conf, i);
			
 
				+            }
			
 
				+    }
			
 
				 #endif
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_USE_MIC
			
 
				 static void
			
 
				 _starpu_deinit_mic_node (unsigned mic_idx)
			
 
				 {
			
@@ -936,6 +1077,17 @@ _starpu_deinit_mic_node (unsigned mic_idx)
 
				 
			
 
				 	_starpu_mp_common_node_destroy(mic_nodes[mic_idx]);
			
 
				 }
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+static void _starpu_deinit_mpi_node(int devid)
			
 
				+{ /* Send STARPU_MP_COMMAND_EXIT (no payload) to mpi_ms_nodes[devid], then destroy that node. */
			
 
				+        _starpu_mp_common_send_command(mpi_ms_nodes[devid], STARPU_MP_COMMAND_EXIT, NULL, 0);                          
			
 
				+        /* The node is destroyed only after the EXIT command has been handed to the send call. */
			
 
				+        _starpu_mp_common_node_destroy(mpi_ms_nodes[devid]);
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				 
			
 
				 static void
			
 
				 _starpu_deinit_mp_config (struct _starpu_machine_config *config)
			
@@ -943,11 +1095,16 @@ _starpu_deinit_mp_config (struct _starpu_machine_config *config)
 
				 	struct _starpu_machine_topology *topology = &config->topology;
			
 
				 	unsigned i;
			
 
				 
			
 
				+#ifdef STARPU_USE_MIC
			
 
				 	for (i = 0; i < topology->nmicdevices; i++)
			
 
				 		_starpu_deinit_mic_node (i);
			
 
				 	_starpu_mic_clear_kernels();
			
 
				-}
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+	for (i = 0; i < topology->nmpidevices; i++)
			
 
				+		_starpu_deinit_mpi_node (i);
			
 
				+#endif
			
 
				+}
			
 
				 
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 static unsigned
			
@@ -1006,6 +1163,10 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
				 	for (i = 0; i < (int) (sizeof(mic_worker_set)/sizeof(mic_worker_set[0])); i++)
			
 
				 		mic_worker_set[i].workers = NULL;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+	for (i = 0; i < (int) (sizeof(mpi_worker_set)/sizeof(mpi_worker_set[0])); i++)
			
 
				+		mpi_worker_set[i].workers = NULL;
			
 
				+#endif
			
 
				 
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 	int ncuda = config->conf.ncuda;
			
@@ -1243,7 +1404,7 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
				 
			
 
				 		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].type = STARPU_SCC_WORKER;
			
 
				 		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].devid = sccdev;
			
 
				-		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].ncore = 1;
			
 
				+		config->workers[topology->nworkers + sccdev].perf_arch.devices[0].ncores = 1;
			
 
				 		config->workers[topology->nworkers + sccdev].subworkerid = 0;
			
 
				 		config->workers[topology->nworkers + sccdev].devid = devid;
			
 
				 		config->workers[topology->nworkers + sccdev].worker_mask = STARPU_SCC;
			
@@ -1256,12 +1417,8 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
				 	topology->nworkers += topology->nsccdevices;
			
 
				 #endif /* STARPU_USE_SCC */
			
 
				 
			
 
				-
			
 
				-	/* Unless not requested, we need to complete configuration with the
			
 
				-	 * ones of the mp nodes. */
			
 
				-#ifdef STARPU_USE_MIC
			
 
				-	if (! no_mp_config)
			
 
				-	    _starpu_init_mp_config (config, &config->conf);
			
 
				+#if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				+	    _starpu_init_mp_config (config, &config->conf, no_mp_config);
			
 
				 #endif
			
 
				 
			
 
				 /* we put the CPU section after the accelerator : in case there was an
			
@@ -1278,7 +1435,17 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
				 			for (j = 0; j < STARPU_MAXMICDEVS; j++)
			
 
				 				mic_busy_cpus += (topology->nmiccores[j] ? 1 : 0);
			
 
				 
			
 
				-			unsigned already_busy_cpus = mic_busy_cpus + topology->ncudagpus
			
 
				+            unsigned mpi_ms_busy_cpus = 0;
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+#ifdef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				+            for (j = 0; j < STARPU_MAXMPIDEVS; j++)
			
 
				+                    mpi_ms_busy_cpus += (topology->nmpicores[j] ? 1 : 0);
			
 
				+#else
			
 
				+            mpi_ms_busy_cpus = 1; /* we launch one thread to control all slaves */
			
 
				+#endif
			
 
				+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				+
			
 
				+			unsigned already_busy_cpus = mpi_ms_busy_cpus + mic_busy_cpus + topology->ncudagpus
			
 
				 				+ topology->nopenclgpus + topology->nsccdevices;
			
 
				 
			
 
				 			long avail_cpus = (long) topology->nhwcpus - (long) already_busy_cpus;
			
@@ -1563,6 +1730,11 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 	unsigned mic_memory_nodes[STARPU_MAXMICDEVS];
			
 
				 	unsigned mic_bindid[STARPU_MAXMICDEVS];
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+	unsigned mpi_init[STARPU_MAXMPIDEVS] = { };
			
 
				+	unsigned mpi_memory_nodes[STARPU_MAXMPIDEVS];
			
 
				+	unsigned mpi_bindid[STARPU_MAXMPIDEVS];
			
 
				+#endif
			
 
				 	unsigned bindid;
			
 
				 
			
 
				 	for (bindid = 0; bindid < config->nbindid; bindid++)
			
@@ -1579,7 +1751,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 		struct _starpu_worker *workerarg = &config->workers[worker];
			
 
				 		unsigned devid = workerarg->devid;
			
 
				 
			
 
				-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 		/* Perhaps the worker has some "favourite" bindings  */
			
 
				 		int *preferred_binding = NULL;
			
 
				 		int npreferred = 0;
			
@@ -1610,6 +1782,10 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 				}
			
 
				 				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
			
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				+
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
			
 
				+
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
			
 
				 				if (memory_node != STARPU_MAIN_RAM)
			
@@ -1701,6 +1877,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 					}
			
 
				 				}
			
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				+
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[memory_node]);
			
 
				 				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
			
@@ -1740,6 +1919,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 #endif /* SIMGRID */
			
 
				 				}
			
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				+
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
			
 
				 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
			
@@ -1771,6 +1953,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 				}
			
 
				 				workerarg->bindid = mic_bindid[devid];
			
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				+
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[memory_node]);
			
 
				 				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
			
@@ -1787,13 +1972,59 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 
			
 
				 				memory_node = ram_memory_node;
			
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				+
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
			
 
				 				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
			
 
				 #endif
			
 
				 			}
			
 
				 				break;
			
 
				+#endif /* STARPU_USE_SCC */
			
 
				+
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+			case STARPU_MPI_WORKER:
			
 
				+			{
			
 
				+				if (mpi_init[devid])
			
 
				+				{
			
 
				+					memory_node = mpi_memory_nodes[devid];
			
 
				+				}
			
 
				+				else
			
 
				+				{
			
 
				+					mpi_init[devid] = 1;
			
 
				+					mpi_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				+					memory_node = mpi_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MPI_MS_RAM, devid);
			
 
				+					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
			
 
				+					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
			
 
				+
			
 
				+				}
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
			
 
				+                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
			
 
				+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				+                                /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
			
 
				+                                unsigned findworker;
			
 
				+                                for (findworker = 0; findworker < worker; findworker++)
			
 
				+                                {
			
 
				+                                        struct _starpu_worker *findworkerarg = &config->workers[findworker];
			
 
				+                                        if (findworkerarg->arch == STARPU_MPI_WORKER)
			
 
				+                                        {
			
 
				+                                                _starpu_worker_drives_memory_node(workerarg->workerid, findworkerarg->memory_node);
			
 
				+                                                _starpu_worker_drives_memory_node(findworkerarg->workerid, memory_node);
			
 
				+                                        }
			
 
				+                                }
			
 
				+#endif
			
 
				+                
			
 
				+				workerarg->bindid = mpi_bindid[devid];
			
 
				+				_starpu_memory_node_add_nworkers(memory_node);
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[memory_node]);
			
 
				+				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
			
 
				 #endif
			
 
				+				break;
			
 
				+			}
			
 
				+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				+
			
 
				 
			
 
				 			default:
			
 
				 				STARPU_ABORT();
			
@@ -1885,6 +2116,7 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 
				 	config->opencl_nodeid = -1;
			
 
				 	config->mic_nodeid = -1;
			
 
				 	config->scc_nodeid = -1;
			
 
				+        config->mpi_nodeid = -1;
			
 
				 	for (i = 0; i < starpu_worker_get_count(); i++)
			
 
				 	{
			
 
				 		switch (starpu_worker_get_type(i))
			
@@ -1919,6 +2151,12 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 
				 				else if (config->scc_nodeid != (int) starpu_worker_get_memory_node(i))
			
 
				 					config->scc_nodeid = -2;
			
 
				 				break;
			
 
				+			case STARPU_MPI_WORKER:
			
 
				+				if (config->mpi_nodeid == -1)
			
 
				+					config->mpi_nodeid = starpu_worker_get_memory_node(i);
			
 
				+				else if (config->mpi_nodeid != (int) starpu_worker_get_memory_node(i))
			
 
				+					config->mpi_nodeid = -2;
			
 
				+				break;
			
 
				 			case STARPU_ANY_WORKER:
			
 
				 				STARPU_ASSERT(0);
			
 
				 		}
			
@@ -1929,7 +2167,7 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 
				 
			
 
				 void _starpu_destroy_topology(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-#ifdef STARPU_USE_MIC
			
 
				+#if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 	_starpu_deinit_mp_config(config);
			
 
				 #endif
			
 
				 
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -40,6 +40,7 @@
 
				 #include <top/starpu_top_core.h>
			
 
				 #include <drivers/mp_common/sink_common.h>
			
 
				 #include <drivers/scc/driver_scc_common.h>
			
 
				+#include <drivers/mpi/driver_mpi_common.h>
			
 
				 
			
 
				 #include <drivers/cpu/driver_cpu.h>
			
 
				 #include <drivers/cuda/driver_cuda.h>
			
@@ -142,6 +143,10 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 
				 				if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->mic_funcs[impl] != NULL)
			
 
				 					test_implementation = 1;
			
 
				 				break;
			
 
				+                        case STARPU_MPI_WORKER:
			
 
				+                                if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->mpi_ms_funcs[impl] != NULL)
			
 
				+                                        test_implementation = 1;
			
 
				+                                break;
			
 
				 			case STARPU_SCC_WORKER:
			
 
				 				if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->scc_funcs[impl] != NULL)
			
 
				 					test_implementation = 1;
			
@@ -205,6 +210,11 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 
				 	    _starpu_worker_exists_and_can_execute(task, STARPU_MIC_WORKER))
			
 
				 		return 1;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+	if ((task->cl->where & STARPU_MPI_MS) &&
			
 
				+	    _starpu_worker_exists_and_can_execute(task, STARPU_MPI_WORKER))
			
 
				+		return 1;
			
 
				+#endif
			
 
				 #ifdef STARPU_USE_SCC
			
 
				 	if ((task->cl->where & STARPU_SCC) &&
			
 
				 	    _starpu_worker_exists_and_can_execute(task, STARPU_SCC_WORKER))
			
@@ -280,6 +290,13 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 
				 
			
 
				 		return func != NULL || func_name != NULL;
			
 
				 	}
			
 
				+	case STARPU_MPI_WORKER:
			
 
				+	{
			
 
				+		starpu_mpi_ms_func_t func = _starpu_task_get_mpi_ms_nth_implementation(cl, nimpl);
			
 
				+		const char *func_name = _starpu_task_get_cpu_name_nth_implementation(cl, nimpl);
			
 
				+
			
 
				+		return func != NULL || func_name != NULL;
			
 
				+	}
			
 
				 	case STARPU_SCC_WORKER:
			
 
				 	{
			
 
				 		starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(cl, nimpl);
			
@@ -521,6 +538,9 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 
				 	starpu_pthread_wait_init(&workerarg->wait);
			
 
				 	starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_task_queue[workerarg->workerid]);
			
 
				 #endif
			
 
				+        workerarg->task_sending = NULL;
			
 
				+        workerarg->nb_buffers_sent = 0;
			
 
				+
			
 
				 	workerarg->first_task = 0;
			
 
				 	workerarg->ntasks = 0;
			
 
				 	/* set initialized by topology.c */
			
@@ -591,7 +611,6 @@ void _starpu_driver_start(struct _starpu_worker *worker, unsigned fut_key, unsig
 
				 	_starpu_fxt_register_thread(worker->bindid);
			
 
				 	_starpu_worker_start(worker, fut_key, sync);
			
 
				 #endif
			
 
				-
			
 
				 	_starpu_memory_node_set_local_key(&worker->memory_node);
			
 
				 
			
 
				 	_starpu_set_local_worker_key(worker);
			
@@ -639,7 +658,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
				 	{
			
 
				 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
			
 
				 		unsigned devid = workerarg->devid;
			
 
				-#if defined(STARPU_USE_MIC) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				+#if defined(STARPU_USE_MIC) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 		struct _starpu_worker_set *worker_set = workerarg->set;
			
 
				 #endif
			
 
				 
			
@@ -808,13 +827,91 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
				 				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
			
 
				 #endif
			
 
				 				break;
			
 
				+#endif /* STARPU_USE_SCC */
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+			case STARPU_MPI_WORKER:
			
 
				+				/* We spawn only one thread
			
 
				+				 * per MPI device, which will control all MPI
			
 
				+				 * workers of this device. (by using a worker set). */
			
 
				+				if (worker_set->workers != workerarg)
			
 
				+					break;
			
 
				+
			
 
				+				worker_set->nworkers = pconfig->topology.nmpicores[devid];
			
 
				+
			
 
				+				worker_set->set_is_initialized = 0;
			
 
				+
			
 
				+#ifdef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				+                /* if MPI has multiple threads support
			
 
				+                 * we launch 1 thread per device 
			
 
				+                 * else 
			
 
				+                 * we launch one thread for all devices
			
 
				+                 */
			
 
				+				STARPU_PTHREAD_CREATE_ON(
			
 
				+						workerarg->name,
			
 
				+						&worker_set->worker_thread,
			
 
				+						NULL,
			
 
				+						_starpu_mpi_src_worker,
			
 
				+						worker_set,
			
 
				+						_starpu_simgrid_get_host_by_worker(workerarg));
			
 
				+
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
			
 
				+				while (!workerarg->worker_is_running)
			
 
				+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
			
 
				+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
			
 
				 #endif
			
 
				 
			
 
				+				STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
			
 
				+				while (!worker_set->set_is_initialized)
			
 
				+					STARPU_PTHREAD_COND_WAIT(&worker_set->ready_cond,
			
 
				+								  &worker_set->mutex);
			
 
				+				STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
			
 
				+
			
 
				+				worker_set->started = 1;
			
 
				+#endif /* STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD */
			
 
				+
			
 
				+				break;
			
 
				+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				+
			
 
				 			default:
			
 
				 				STARPU_ABORT();
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+#if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
			
 
				+        if (pconfig->topology.nmpidevices > 0)
			
 
				+        {
			
 
				+                struct _starpu_worker_set * worker_set_zero = &mpi_worker_set[0];
			
 
				+                struct _starpu_worker * worker_zero = &worker_set_zero->workers[0];
			
 
				+                STARPU_PTHREAD_CREATE_ON(
			
 
				+                                worker_zero->name,
			
 
				+                                &worker_set_zero->worker_thread,
			
 
				+                                NULL,
			
 
				+                                _starpu_mpi_src_worker,
			
 
				+                                &mpi_worker_set,
			
 
				+                                _starpu_simgrid_get_host_by_worker(worker_zero));
			
 
				+
			
 
				+                /* We use the first worker to know if everything is finished */
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&worker_zero->mutex);
			
 
				+                while (!worker_zero->worker_is_running)
			
 
				+                        STARPU_PTHREAD_COND_WAIT(&worker_zero->started_cond, &worker_zero->mutex);
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&worker_zero->mutex);
			
 
				+#endif
			
 
				+
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&worker_set_zero->mutex);
			
 
				+                while (!worker_set_zero->set_is_initialized)
			
 
				+                        STARPU_PTHREAD_COND_WAIT(&worker_set_zero->ready_cond,
			
 
				+                                        &worker_set_zero->mutex);
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set_zero->mutex);
			
 
				+
			
 
				+                worker_set_zero->started = 1;
			
 
				+                worker_set_zero->worker_thread = mpi_worker_set[0].worker_thread;
			
 
				+
			
 
				+        }
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
			
@@ -869,6 +966,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
				 				break;
			
 
				 #endif
			
 
				 			case STARPU_MIC_WORKER:
			
 
				+                        case STARPU_MPI_WORKER:
			
 
				 				/* Already waited above */
			
 
				 				break;
			
 
				 			case STARPU_SCC_WORKER:
			
@@ -911,6 +1009,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 
				 	conf->nopencl = starpu_get_env_number("STARPU_NOPENCL");
			
 
				 	conf->nmic = starpu_get_env_number("STARPU_NMIC");
			
 
				 	conf->nscc = starpu_get_env_number("STARPU_NSCC");
			
 
				+	conf->nmpi_ms = starpu_get_env_number("STARPU_NMPI_MS");
			
 
				 	conf->calibrate = starpu_get_env_number("STARPU_CALIBRATE");
			
 
				 	conf->bus_calibrate = starpu_get_env_number("STARPU_BUS_CALIBRATE");
			
 
				 	conf->mic_sink_program_path = starpu_getenv("STARPU_MIC_PROGRAM_PATH");
			
@@ -926,6 +1025,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 
				 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
			
 
				 	conf->use_explicit_workers_mic_deviceid = 0; /* TODO */
			
 
				 	conf->use_explicit_workers_scc_deviceid = 0; /* TODO */
			
 
				+	conf->use_explicit_workers_mpi_deviceid = 0; /* TODO */
			
 
				 
			
 
				 	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
			
 
				 	if (conf->single_combined_worker == -1)
			
@@ -963,6 +1063,14 @@ int starpu_conf_init(struct starpu_conf *conf)
 
				 		conf->disable_asynchronous_mic_copy = 0;
			
 
				 #endif
			
 
				 
			
 
				+#if defined(STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY)
			
 
				+    conf->disable_asynchronous_mpi_ms_copy = 1;
			
 
				+#else
			
 
				+    conf->disable_asynchronous_mpi_ms_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY");
			
 
				+    if(conf->disable_asynchronous_mpi_ms_copy == -1)
			
 
				+        conf->disable_asynchronous_mpi_ms_copy = 0;
			
 
				+#endif
			
 
				+
			
 
				 	/* 64MiB by default */
			
 
				 	conf->trace_buffer_size = starpu_get_env_number_default("STARPU_TRACE_BUFFER_SIZE", 64) << 20;
			
 
				 	return 0;
			
@@ -1007,6 +1115,7 @@ void _starpu_conf_check_environment(struct starpu_conf *conf)
 
				 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY", &conf->disable_asynchronous_cuda_copy);
			
 
				 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY", &conf->disable_asynchronous_opencl_copy);
			
 
				 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY", &conf->disable_asynchronous_mic_copy);
			
 
				+	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY", &conf->disable_asynchronous_mpi_ms_copy);
			
 
				 }
			
 
				 
			
 
				 struct starpu_tree* starpu_workers_get_tree(void)
			
@@ -1109,6 +1218,18 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
				 		setenv("STARPU_SINK", "STARPU_SCC", 1);
			
 
				 #	endif
			
 
				 
			
 
				+#       ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        if (_starpu_mpi_common_mp_init() == -ENODEV)
			
 
				+        {
			
 
				+                initialized = UNINITIALIZED;
			
 
				+                return -ENODEV;
			
 
				+        }
			
 
				+
			
 
				+        /* In MPI case we look at the rank to know if we are a sink */
			
 
				+        if (!_starpu_mpi_common_is_src_node())
			
 
				+                setenv("STARPU_SINK", "STARPU_MPI_MS", 1);
			
 
				+#       endif
			
 
				+
			
 
				 	/* If StarPU was configured to use MP sinks, we have to control the
			
 
				 	 * kind on node we are running on : host or sink ? */
			
 
				 	if (starpu_getenv("STARPU_SINK"))
			
@@ -1243,7 +1364,8 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
				 	/* Depending on whether we are a MP sink or not, we must build the
			
 
				 	 * topology with MP nodes or not. */
			
 
				 	ret = _starpu_build_topology(&_starpu_config, is_a_sink);
			
 
				-	if (ret)
			
 
				+    /* sink doesn't exit even if no worker is discovered */
			
 
				+	if (ret && !is_a_sink)
			
 
				 	{
			
 
				 		starpu_perfmodel_free_sampling_directories();
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
			
@@ -1255,6 +1377,11 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
				 		if (_starpu_scc_common_is_mp_initialized())
			
 
				 			_starpu_scc_src_mp_deinit();
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+                if (_starpu_mpi_common_is_mp_initialized())
			
 
				+                        _starpu_mpi_common_mp_deinit();
			
 
				+#endif
			
 
				+
			
 
				 		initialized = UNINITIALIZED;
			
 
				 		/* Let somebody else try to do it */
			
 
				 		STARPU_PTHREAD_COND_SIGNAL(&init_cond);
			
@@ -1602,6 +1729,10 @@ void starpu_shutdown(void)
 
				 	if (_starpu_scc_common_is_mp_initialized())
			
 
				 		_starpu_scc_src_mp_deinit();
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+    if (_starpu_mpi_common_is_mp_initialized())
			
 
				+        _starpu_mpi_common_mp_deinit();
			
 
				+#endif 
			
 
				 	_starpu_print_idle_time();
			
 
				 	_STARPU_DEBUG("Shutdown finished\n");
			
 
				 
			
@@ -1646,12 +1777,16 @@ int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 
				 		case STARPU_SCC_WORKER:
			
 
				 			return _starpu_config.topology.nsccdevices;
			
 
				 
			
 
				-		case STARPU_ANY_WORKER:
			
 
				-			return _starpu_config.topology.ncpus+
			
 
				-			       _starpu_config.topology.ncudagpus+
			
 
				-			       _starpu_config.topology.nopenclgpus+
			
 
				-			       _starpu_config.topology.nmicdevices+
			
 
				-			       _starpu_config.topology.nsccdevices;
			
 
				+                case STARPU_MPI_WORKER:
			
 
				+                        return _starpu_config.topology.nmpidevices;
			
 
				+
			
 
				+                case STARPU_ANY_WORKER:
			
 
				+                        return _starpu_config.topology.ncpus+
			
 
				+                                _starpu_config.topology.ncudagpus+
			
 
				+                                _starpu_config.topology.nopenclgpus+
			
 
				+                                _starpu_config.topology.nmicdevices+
			
 
				+                                _starpu_config.topology.nsccdevices+
			
 
				+                                _starpu_config.topology.nmpidevices;
			
 
				 		default:
			
 
				 			return -EINVAL;
			
 
				 	}
			
@@ -1697,6 +1832,11 @@ int starpu_asynchronous_mic_copy_disabled(void)
 
				 	return _starpu_config.conf.disable_asynchronous_mic_copy;
			
 
				 }
			
 
				 
			
 
				+int starpu_asynchronous_mpi_ms_copy_disabled(void)
			
 
				+{
			
 
				+        return _starpu_config.conf.disable_asynchronous_mpi_ms_copy;
			
 
				+}
			
 
				+
			
 
				 unsigned starpu_mic_worker_get_count(void)
			
 
				 {
			
 
				 	int i = 0, count = 0;
			
@@ -1712,6 +1852,11 @@ unsigned starpu_scc_worker_get_count(void)
 
				 	return _starpu_config.topology.nsccdevices;
			
 
				 }
			
 
				 
			
 
				+unsigned starpu_mpi_ms_worker_get_count(void)
			
 
				+{
			
 
				+        return _starpu_config.topology.nmpidevices;
			
 
				+}
			
 
				+
			
 
				 /* When analyzing performance, it is useful to see what is the processing unit
			
 
				  * that actually performed the task. This function returns the id of the
			
 
				  * processing unit actually executing it, therefore it makes no sense to use it
			
@@ -2239,6 +2384,7 @@ char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 
				 	if (type == STARPU_CUDA_WORKER) return "STARPU_CUDA_WORKER";
			
 
				 	if (type == STARPU_OPENCL_WORKER) return "STARPU_OPENCL_WORKER";
			
 
				 	if (type == STARPU_MIC_WORKER) return "STARPU_MIC_WORKER";
			
 
				+        if (type == STARPU_MPI_WORKER) return "STARPU_MPI_WORKER";
			
 
				 	if (type == STARPU_SCC_WORKER) return "STARPU_SCC_WORKER";
			
 
				 	if (type == STARPU_ANY_WORKER) return "STARPU_ANY_WORKER";
			
 
				 	return "STARPU_unknown_WORKER";
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
			
 
				- * Copyright (C) 2011  INRIA
			
 
				+ * Copyright (C) 2011, 2016  INRIA
			
 
				  * Copyright (C) 2016  Uppsala University
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -47,6 +47,9 @@
 
				 #include <drivers/scc/driver_scc_source.h>
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+#include <drivers/mpi/driver_mpi_source.h>
			
 
				+#endif
			
 
				 
			
 
				 #include <drivers/cpu/driver_cpu.h>
			
 
				 
			
@@ -112,6 +115,8 @@ LIST_TYPE(_starpu_worker,
 
				 
			
 
				 	unsigned spinning_backoff ; /* number of cycles to pause when spinning  */
			
 
				 
			
 
				+        unsigned nb_buffers_sent; /* number of pieces of data already sent to the remote side */
			
 
				+        struct starpu_task *task_sending; /* The buffers of this task are being sent */
			
 
				 
			
 
				 	/* indicate whether the workers shares tasks lists with other workers*/
			
 
				 	/* in this case when removing it from a context it disappears instantly */
			
@@ -180,6 +185,10 @@ struct _starpu_worker_set
 
				 	unsigned set_is_initialized;
			
 
				 };
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+extern struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
			
 
				+#endif
			
 
				+
			
 
				 struct _starpu_machine_topology
			
 
				 {
			
 
				 	/* Total number of workers. */
			
@@ -222,6 +231,11 @@ struct _starpu_machine_topology
 
				 	 */
			
 
				 	unsigned nhwscc;
			
 
				 
			
 
				+	/* Total number of MPI nodes, as detected. May be different
			
 
				+	 * from the actual number of node workers.
			
 
				+	 */
			
 
				+	unsigned nhwmpi;
			
 
				+
			
 
				 	/* Actual number of CPU workers used by StarPU. */
			
 
				 	unsigned ncpus;
			
 
				 
			
@@ -234,6 +248,13 @@ struct _starpu_machine_topology
 
				 	/* Actual number of SCC workers used by StarPU. */
			
 
				 	unsigned nsccdevices;
			
 
				 
			
 
				+	/* Actual number of MPI workers used by StarPU. */
			
 
				+	unsigned nmpidevices;
			
 
				+        unsigned nhwmpidevices;
			
 
				+
			
 
				+	unsigned nhwmpicores[STARPU_MAXMPIDEVS]; // Each MPI node has its set of cores.
			
 
				+	unsigned nmpicores[STARPU_MAXMPIDEVS];
			
 
				+
			
 
				 	/* Topology of MP nodes (mainly MIC and SCC) as well as necessary
			
 
				 	 * objects to communicate with them. */
			
 
				 	unsigned nhwmicdevices;
			
@@ -283,6 +304,8 @@ struct _starpu_machine_topology
 
				 	 * are taken in ID order.
			
 
				 	 */
			
 
				 	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
			
 
				+
			
 
				+	unsigned workers_mpi_deviceid[STARPU_NMAXWORKERS];
			
 
				 };
			
 
				 
			
 
				 struct _starpu_machine_config
			
@@ -309,6 +332,9 @@ struct _starpu_machine_config
 
				 	/* Which SCC do we use? */
			
 
				 	int current_scc_deviceid;
			
 
				 
			
 
				+	/* Which MPI do we use? */
			
 
				+	int current_mpi_deviceid;
			
 
				+
			
 
				 	/* Memory node for cpus, if only one */
			
 
				 	int cpus_nodeid;
			
 
				 	/* Memory node for CUDA, if only one */
			
@@ -319,6 +345,8 @@ struct _starpu_machine_config
 
				 	int mic_nodeid;
			
 
				 	/* Memory node for SCC, if only one */
			
 
				 	int scc_nodeid;
			
 
				+	/* Memory node for MPI, if only one */
			
 
				+	int mpi_nodeid;
			
 
				 
			
 
				 	/* Basic workers : each of this worker is running its own driver and
			
 
				 	 * can be combined with other basic workers. */
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -146,7 +146,8 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
				 
			
 
				 			if (starpu_node_get_kind(i) == STARPU_CPU_RAM || 
			
 
				 			    starpu_node_get_kind(i) == STARPU_SCC_RAM ||
			
 
				-			    starpu_node_get_kind(i) == STARPU_SCC_SHM)
			
 
				+			    starpu_node_get_kind(i) == STARPU_SCC_SHM ||
			
 
				+                            starpu_node_get_kind(i) == STARPU_MPI_MS_RAM)
			
 
				 				i_ram = i;
			
 
				 			if (starpu_node_get_kind(i) == STARPU_DISK_RAM)			
			
 
				 				i_disk = i;
			
@@ -259,6 +260,11 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 
				 		case STARPU_MIC_RAM:
			
 
				 			/* TODO: We don't handle direct MIC-MIC transfers yet */
			
 
				 			return 0;
			
 
				+                case STARPU_MPI_MS_RAM:
			
 
				+                {
			
 
				+                        enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
			
 
				+                        return kind == STARPU_MPI_MS_RAM;
			
 
				+                }
			
 
				 		case STARPU_SCC_RAM:
			
 
				 			return 1;
			
 
				 		default:
			
@@ -717,14 +723,13 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
				 			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
			
 
				 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
			
 
				 {
			
 
				-	unsigned local_node = _starpu_memory_node_get_local_key();
			
 
				         _STARPU_LOG_IN();
			
 
				 
			
 
				 	int cpt = 0;
			
 
				 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 	{
			
 
				 		cpt++;
			
 
				-		_starpu_datawizard_progress(local_node, 1);
			
 
				+		_starpu_datawizard_progress(1);
			
 
				 	}
			
 
				 	if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
@@ -808,12 +813,11 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 
				 	if ((wt_mask & ~(1<<memory_node)))
			
 
				 		_starpu_write_through_data(handle, memory_node, wt_mask);
			
 
				 
			
 
				-	unsigned local_node = _starpu_memory_node_get_local_key();
			
 
				 	int cpt = 0;
			
 
				 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 	{
			
 
				 		cpt++;
			
 
				-		_starpu_datawizard_progress(local_node, 1);
			
 
				+		_starpu_datawizard_progress(1);
			
 
				 	}
			
 
				 	if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
@@ -831,12 +835,11 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 
				 
			
 
				 static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
			
 
				 {
			
 
				-	unsigned local_node = _starpu_memory_node_get_local_key();
			
 
				 	int cpt = 0;
			
 
				 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 	{
			
 
				 		cpt++;
			
 
				-		_starpu_datawizard_progress(local_node, 1);
			
 
				+		_starpu_datawizard_progress(1);
			
 
				 	}
			
 
				 	if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
@@ -909,7 +912,7 @@ int starpu_idle_prefetch_task_input_on_node(struct starpu_task *task, unsigned n
 
				 	return starpu_idle_prefetch_task_input_on_node_prio(task, node, prio);
			
 
				 }
			
 
				 
			
 
				-static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node)
			
 
				+struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node)
			
 
				 {
			
 
				 	if (mode & (STARPU_SCRATCH|STARPU_REDUX))
			
 
				 	{
			
--- a/src/datawizard/coherency.h
+++ b/src/datawizard/coherency.h
@@ -324,5 +324,6 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle);
 
				 void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle);
			
 
				 
			
 
				 void _starpu_data_set_unregister_hook(starpu_data_handle_t handle, _starpu_data_handle_unregister_hook func);
			
 
				+struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node);
			
 
				 
			
 
				 #endif // __COHERENCY__H__
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2013, 2016  CNRS
			
 
				+ * Copyright (C) 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -22,6 +23,9 @@
 
				 #include <datawizard/datastats.h>
			
 
				 #include <datawizard/memory_nodes.h>
			
 
				 #include <drivers/disk/driver_disk.h>
			
 
				+#include <drivers/mpi/driver_mpi_sink.h>
			
 
				+#include <drivers/mpi/driver_mpi_source.h>
			
 
				+#include <drivers/mpi/driver_mpi_common.h>
			
 
				 #include <common/fxt.h>
			
 
				 #include "copy_driver.h"
			
 
				 #include "memalloc.h"
			
@@ -420,6 +424,79 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 		break;
			
 
				 	/* TODO: MIC -> MIC */
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MPI_MS_RAM):
			
 
				+                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
			
 
				+                                !(copy_methods->ram_to_mpi_ms_async || copy_methods->any_to_any))
			
 
				+                {
			
 
				+                        /* no request, or asynchronous copies are disabled or unavailable: copy synchronously */
			
 
				+                        STARPU_ASSERT(copy_methods->ram_to_mpi_ms || copy_methods->any_to_any);
			
 
				+                        if (copy_methods->ram_to_mpi_ms)
			
 
				+                                copy_methods->ram_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
			
 
				+                        else
			
 
				+                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				+                }
			
 
				+                else
			
 
				+                {
			
 
				+                        req->async_channel.type = STARPU_MPI_MS_RAM;
			
 
				+                        if(copy_methods->ram_to_mpi_ms_async)
			
 
				+                                ret = copy_methods->ram_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+                        else
			
 
				+                        {
			
 
				+                                STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+                        }
			
 
				+                }
			
 
				+                break;
			
 
				+
			
 
				+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM,STARPU_CPU_RAM):
			
 
				+                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
			
 
				+                                !(copy_methods->mpi_ms_to_ram_async || copy_methods->any_to_any))
			
 
				+                {
			
 
				+                        /* no request, or asynchronous copies are disabled or unavailable: copy synchronously */
			
 
				+                        STARPU_ASSERT(copy_methods->mpi_ms_to_ram || copy_methods->any_to_any);
			
 
				+                        if (copy_methods->mpi_ms_to_ram)
			
 
				+                                copy_methods->mpi_ms_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				+                        else
			
 
				+                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				+                }
			
 
				+                else
			
 
				+                {
			
 
				+                        req->async_channel.type = STARPU_MPI_MS_RAM;
			
 
				+                        if(copy_methods->mpi_ms_to_ram_async)
			
 
				+                                ret = copy_methods->mpi_ms_to_ram_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+                        else
			
 
				+                        {
			
 
				+                                STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+                        }
			
 
				+                }
			
 
				+                break;
			
 
				+
			
 
				+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM,STARPU_MPI_MS_RAM):
			
 
				+                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
			
 
				+                                !(copy_methods->mpi_ms_to_mpi_ms_async || copy_methods->any_to_any))
			
 
				+                {
			
 
				+                        /* no request, or asynchronous copies are disabled or unavailable: copy synchronously */
			
 
				+                        STARPU_ASSERT(copy_methods->mpi_ms_to_mpi_ms || copy_methods->any_to_any);
			
 
				+                        if (copy_methods->mpi_ms_to_mpi_ms)
			
 
				+                                copy_methods->mpi_ms_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
			
 
				+                        else
			
 
				+                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				+                }
			
 
				+                else
			
 
				+                {
			
 
				+                        req->async_channel.type = STARPU_MPI_MS_RAM;
			
 
				+                        if(copy_methods->mpi_ms_to_mpi_ms_async)
			
 
				+                                ret = copy_methods->mpi_ms_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+                        else
			
 
				+                        {
			
 
				+                                STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+                        }
			
 
				+                }
			
 
				+                break;
			
 
				+#endif
			
 
				 #ifdef STARPU_USE_SCC
			
 
				 		/* SCC RAM associated to the master process is considered as
			
 
				 		 * the main memory node. */
			
@@ -660,6 +737,43 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 
				 				(void*) (dst + dst_offset), dst_node,
			
 
				 				size);
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_MPI_MS_RAM):
			
 
				+                if (async_data)
			
 
				+                        return _starpu_mpi_copy_ram_to_mpi_async(
			
 
				+                                        (void*) (src + src_offset), src_node,
			
 
				+                                        (void*) (dst + dst_offset), dst_node,
			
 
				+                                        size, async_data);
			
 
				+                else
			
 
				+                        return _starpu_mpi_copy_ram_to_mpi_sync(
			
 
				+                                        (void*) (src + src_offset), src_node,
			
 
				+                                        (void*) (dst + dst_offset), dst_node,
			
 
				+                                        size);
			
 
				+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM, STARPU_CPU_RAM):
			
 
				+                if (async_data)
			
 
				+                        return _starpu_mpi_copy_mpi_to_ram_async(
			
 
				+                                        (void*) (src + src_offset), src_node,
			
 
				+                                        (void*) (dst + dst_offset), dst_node,
			
 
				+                                        size, async_data);
			
 
				+                else
			
 
				+                        return _starpu_mpi_copy_mpi_to_ram_sync(
			
 
				+                                        (void*) (src + src_offset), src_node,
			
 
				+                                        (void*) (dst + dst_offset), dst_node,
			
 
				+                                        size);
			
 
				+
			
 
				+        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM, STARPU_MPI_MS_RAM):
			
 
				+                if (async_data)
			
 
				+                        return _starpu_mpi_copy_sink_to_sink_async(
			
 
				+                                        (void*) (src + src_offset), src_node,
			
 
				+                                        (void*) (dst + dst_offset), dst_node,
			
 
				+                                        size, async_data);
			
 
				+                else
			
 
				+                        return _starpu_mpi_copy_sink_to_sink_sync(
			
 
				+                                        (void*) (src + src_offset), src_node,
			
 
				+                                        (void*) (dst + dst_offset), dst_node,
			
 
				+                                        size);
			
 
				+#endif
			
 
				+
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_DISK_RAM):
			
 
				 	{
			
 
				 		return _starpu_disk_copy_src_to_disk(
			
@@ -736,6 +850,11 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
 
				 		_starpu_mic_wait_request_completion(&(async_channel->event.mic_event));
			
 
				 		break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        case STARPU_MPI_MS_RAM:
			
 
				+                _starpu_mpi_common_wait_event(async_channel);
			
 
				+                break;
			
 
				+#endif
			
 
				 	case STARPU_MAIN_RAM:
			
 
				 		starpu_disk_wait_request(async_channel);
			
 
				 		break;
			
@@ -800,6 +919,11 @@ unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *as
 
				 		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
			
 
				 		break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        case STARPU_MPI_MS_RAM:
			
 
				+                success = _starpu_mpi_common_test_event(async_channel);
			
 
				+                break;
			
 
				+#endif
			
 
				 	case STARPU_DISK_RAM:
			
 
				 		success = starpu_disk_test_request(async_channel);
			
 
				 		break;
			
--- a/src/datawizard/copy_driver.h
+++ b/src/datawizard/copy_driver.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2012-2015  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2013, 2015  CNRS
			
 
				+ * Copyright (C) 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -23,6 +24,7 @@
 
				 #endif
			
 
				 
			
 
				 #include <common/config.h>
			
 
				+#include <common/list.h>
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #include <cuda.h>
			
@@ -34,6 +36,10 @@
 
				 #include <starpu_opencl.h>
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+#include <mpi.h>
			
 
				+#endif
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 extern "C"
			
 
				 {
			
@@ -54,6 +60,18 @@ struct _starpu_mic_async_event
 
				 };
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+LIST_TYPE(_starpu_mpi_ms_event_request,
			
 
				+        MPI_Request request;
			
 
				+);
			
 
				+
			
 
				+struct _starpu_mpi_ms_async_event
			
 
				+{
			
 
				+        int is_sender;
			
 
				+        struct _starpu_mpi_ms_event_request_list * requests;
			
 
				+};
			
 
				+#endif
			
 
				+
			
 
				 struct _starpu_disk_async_event
			
 
				 {
			
 
				 	unsigned memory_node;
			
@@ -73,21 +91,30 @@ union _starpu_async_channel_event
 
				 	};
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	cudaEvent_t cuda_event;
			
 
				+        cudaEvent_t cuda_event;
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				         cl_event opencl_event;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        struct _starpu_mpi_ms_async_event mpi_ms_event;
			
 
				+#endif
			
 
				 #ifdef STARPU_USE_MIC
			
 
				-	struct _starpu_mic_async_event mic_event;
			
 
				+        struct _starpu_mic_async_event mic_event;
			
 
				 #endif
			
 
				-	struct _starpu_disk_async_event disk_event;
			
 
				+        struct _starpu_disk_async_event disk_event;
			
 
				 };
			
 
				 
			
 
				 struct _starpu_async_channel
			
 
				 {
			
 
				 	union _starpu_async_channel_event event;
			
 
				 	enum starpu_node_kind type;
			
 
				+        /* Which nodes to poll when an ACK message is needed */
			
 
				+        struct _starpu_mp_node *polling_node_sender;
			
 
				+        struct _starpu_mp_node *polling_node_receiver;
			
 
				+        /* Used to know whether the acknowledgment message has arrived from the sinks */
			
 
				+        volatile int starpu_mp_common_finished_sender; 
			
 
				+        volatile int starpu_mp_common_finished_receiver; 
			
 
				 };
			
 
				 
			
 
				 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);
			
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
			
 
				+ * Copyright (C) 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -152,6 +153,13 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
				 	r->dst_replicate = dst_replicate;
			
 
				 	r->mode = mode;
			
 
				 	r->async_channel.type = STARPU_UNUSED;
			
 
				+        r->async_channel.starpu_mp_common_finished_sender = 0;
			
 
				+        r->async_channel.starpu_mp_common_finished_receiver = 0;
			
 
				+        r->async_channel.polling_node_sender = NULL;
			
 
				+        r->async_channel.polling_node_receiver = NULL;
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        r->async_channel.event.mpi_ms_event.requests = NULL;
			
 
				+#endif
			
 
				 	if (handling_node == -1)
			
 
				 		handling_node = STARPU_MAIN_RAM;
			
 
				 	r->handling_node = handling_node;
			
@@ -206,9 +214,9 @@ int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigne
 
				 	int do_delete = 0;
			
 
				 	int completed;
			
 
				 
			
 
				+#ifdef STARPU_SIMGRID
			
 
				 	unsigned local_node = _starpu_memory_node_get_local_key();
			
 
				 
			
 
				-#ifdef STARPU_SIMGRID
			
 
				 	starpu_pthread_wait_t wait;
			
 
				 
			
 
				 	starpu_pthread_wait_init(&wait);
			
@@ -244,7 +252,7 @@ int _starpu_wait_data_request_completion(struct _starpu_data_request *r, unsigne
 
				 #endif
			
 
				 #endif
			
 
				 
			
 
				-		_starpu_datawizard_progress(local_node, may_alloc);
			
 
				+		_starpu_datawizard_progress(may_alloc);
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 		starpu_pthread_wait_wait(&wait);
			
--- a/src/datawizard/datawizard.c
+++ b/src/datawizard/datawizard.c
@@ -19,13 +19,14 @@
 
				 #include <common/config.h>
			
 
				 #include <datawizard/datawizard.h>
			
 
				 #include <datawizard/memalloc.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 #include <core/workers.h>
			
 
				 #include <core/progress_hook.h>
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 #include <core/simgrid.h>
			
 
				 #endif
			
 
				 
			
 
				-int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests)
			
 
				+int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests)
			
 
				 {
			
 
				 	int ret = 0;
			
 
				 
			
@@ -63,7 +64,23 @@ int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsig
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)
			
 
				+int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
			
 
				 {
			
 
				-	__starpu_datawizard_progress(memory_node, may_alloc, 1);
			
 
				+        int current_worker_id = starpu_worker_get_id();
			
 
				+        unsigned memnode;
			
 
				+
			
 
				+        int ret = 0;
			
 
				+
			
 
				+        for (memnode = 0; memnode < STARPU_MAXNODES; memnode++)
			
 
				+        {
			
 
				+                if (_starpu_worker_drives_memory[current_worker_id][memnode] == 1)
			
 
				+                        ret |= ___starpu_datawizard_progress(memnode, may_alloc, push_requests);
			
 
				+        }
			
 
				+
			
 
				+        return ret;
			
 
				+}
			
 
				+
			
 
				+void _starpu_datawizard_progress(unsigned may_alloc)
			
 
				+{
			
 
				+        __starpu_datawizard_progress(may_alloc, 1);
			
 
				 }
			
--- a/src/datawizard/datawizard.h
+++ b/src/datawizard/datawizard.h
@@ -33,7 +33,8 @@
 
				 
			
 
				 #include <core/dependencies/implicit_data_deps.h>
			
 
				 
			
 
				-int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
			
 
				-void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc);
			
 
				+int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
			
 
				+int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests);
			
 
				+void _starpu_datawizard_progress(unsigned may_alloc);
			
 
				 
			
 
				 #endif // __DATAWIZARD_H__
			
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -592,6 +592,12 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 
				 				addr = 0;
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+		case STARPU_MPI_MS_RAM:
			
 
				+			if (_starpu_mpi_src_allocate_memory((void **)(&addr), size, dst_node))
			
 
				+				addr = 0;
			
 
				+			break;
			
 
				+#endif
			
 
				 #ifdef STARPU_USE_SCC
			
 
				 		case STARPU_SCC_RAM:
			
 
				 			if (_starpu_scc_allocate_memory((void **)(&addr), size, dst_node))
			
@@ -693,6 +699,11 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 
				 			_starpu_mic_free_memory((void*) addr, size, dst_node);
			
 
				 			break;
			
 
				 #endif
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        case STARPU_MPI_MS_RAM:
			
 
				+            _starpu_mpi_source_free_memory((void*) addr, dst_node);
			
 
				+            break;
			
 
				+#endif
			
 
				 #ifdef STARPU_USE_SCC
			
 
				 		case STARPU_SCC_RAM:
			
 
				 			_starpu_scc_free_memory((void *) addr, dst_node);
			
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -1367,7 +1367,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 
				 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 	{
			
 
				 		cpt++;
			
 
				-		_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
			
 
				+		_starpu_datawizard_progress(0);
			
 
				 	}
			
 
				 	if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
--- a/src/datawizard/memory_nodes.c
+++ b/src/datawizard/memory_nodes.c
@@ -25,6 +25,8 @@
 
				 #include "copy_driver.h"
			
 
				 #include "memalloc.h"
			
 
				 
			
 
				+char _starpu_worker_drives_memory[STARPU_NMAXWORKERS][STARPU_MAXNODES];
			
 
				+
			
 
				 struct _starpu_memory_node_descr _starpu_descr;
			
 
				 starpu_pthread_key_t _starpu_memory_node_key STARPU_ATTRIBUTE_INTERNAL;
			
 
				 
			
@@ -92,6 +94,9 @@ void _starpu_memory_node_get_name(unsigned node, char *name, int size)
 
				 	case STARPU_MIC_RAM:
			
 
				 		prefix = "MIC";
			
 
				 		break;
			
 
				+	case STARPU_MPI_MS_RAM:
			
 
				+		prefix = "MPI_MS";
			
 
				+		break;
			
 
				 	case STARPU_SCC_RAM:
			
 
				 		prefix = "SCC_RAM";
			
 
				 		break;
			
--- a/src/datawizard/memory_nodes.h
+++ b/src/datawizard/memory_nodes.h
@@ -33,6 +33,8 @@
 
				 #define _STARPU_MEMORY_NODE_TUPLE_FIRST(tuple) (tuple & 0x0F)
			
 
				 #define _STARPU_MEMORY_NODE_TUPLE_SECOND(tuple) (tuple & 0xF0)
			
 
				 
			
 
				+extern char _starpu_worker_drives_memory[STARPU_NMAXWORKERS][STARPU_MAXNODES];
			
 
				+
			
 
				 struct _starpu_cond_and_mutex
			
 
				 {
			
 
				         starpu_pthread_cond_t *cond;
			
@@ -96,6 +98,12 @@ static inline void _starpu_memory_node_add_nworkers(unsigned node)
 
				 	_starpu_descr.nworkers[node]++;
			
 
				 }
			
 
				 
			
 
				+/* Record that worker worker_id drives memory node memnode (serves the same purpose as _starpu_memory_node_add_nworkers) */
			
 
				+static inline void _starpu_worker_drives_memory_node(unsigned worker_id, unsigned memnode)
			
 
				+{
			
 
				+    _starpu_worker_drives_memory[worker_id][memnode] = 1;   
			
 
				+}
			
 
				+
			
 
				 static inline unsigned _starpu_memory_node_get_nworkers(unsigned node)
			
 
				 {
			
 
				 	return _starpu_descr.nworkers[node];
			
--- a/src/datawizard/write_back.c
+++ b/src/datawizard/write_back.c
@@ -50,7 +50,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
				 				while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
			
 
				 				{
			
 
				 					cpt++;
			
 
				-					_starpu_datawizard_progress(requesting_node, 1);
			
 
				+					__starpu_datawizard_progress(1, 1);
			
 
				 				}
			
 
				 				if (cpt == STARPU_SPIN_MAXTRY)
			
 
				 					_starpu_spin_lock(&handle->header_lock);
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -33,6 +33,7 @@
 
				 #define CUDA_WORKER_COLORS_NB	9
			
 
				 #define OPENCL_WORKER_COLORS_NB 9
			
 
				 #define MIC_WORKER_COLORS_NB	9
			
 
				+#define MPI_MS_WORKER_COLORS_NB	9
			
 
				 #define SCC_WORKER_COLORS_NB	9
			
 
				 #define OTHER_WORKER_COLORS_NB	4
			
 
				 
			
@@ -40,6 +41,7 @@ static char *cpus_worker_colors[CPUS_WORKER_COLORS_NB] = {"/greens9/7", "/greens
 
				 static char *cuda_worker_colors[CUDA_WORKER_COLORS_NB] = {"/ylorrd9/9", "/ylorrd9/6", "/ylorrd9/3", "/ylorrd9/1", "/ylorrd9/8", "/ylorrd9/7", "/ylorrd9/4", "/ylorrd9/2",  "/ylorrd9/1"};
			
 
				 static char *opencl_worker_colors[OPENCL_WORKER_COLORS_NB] = {"/blues9/9", "/blues9/6", "/blues9/3", "/blues9/1", "/blues9/8", "/blues9/7", "/blues9/4", "/blues9/2",  "/blues9/1"};
			
 
				 static char *mic_worker_colors[MIC_WORKER_COLORS_NB] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
			
 
				+static char *mpi_ms_worker_colors[MPI_MS_WORKER_COLORS_NB] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
			
 
				 static char *scc_worker_colors[SCC_WORKER_COLORS_NB] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
			
 
				 static char *other_worker_colors[OTHER_WORKER_COLORS_NB] = {"/greys9/9", "/greys9/8", "/greys9/7", "/greys9/6"};
			
 
				 static char *worker_colors[STARPU_NMAXWORKERS];
			
@@ -48,6 +50,7 @@ static unsigned opencl_index = 0;
 
				 static unsigned cuda_index = 0;
			
 
				 static unsigned cpus_index = 0;
			
 
				 static unsigned mic_index = 0;
			
 
				+static unsigned mpi_ms_index = 0;
			
 
				 static unsigned scc_index = 0;
			
 
				 static unsigned other_index = 0;
			
 
				 
			
@@ -248,6 +251,14 @@ static void set_next_mic_worker_color(int workerid)
 
				 	if (mic_index == MIC_WORKER_COLORS_NB) mic_index = 0;
			
 
				 }
			
 
				 
			
 
				+static void set_next_mpi_ms_worker_color(int workerid)
			
 
				+{
			
 
				+	if (workerid >= STARPU_NMAXWORKERS)
			
 
				+		return;
			
 
				+	worker_colors[workerid] = mpi_ms_worker_colors[mpi_ms_index++];
			
 
				+	if (mpi_ms_index == MPI_MS_WORKER_COLORS_NB) mpi_ms_index = 0;
			
 
				+}
			
 
				+
			
 
				 static void set_next_scc_worker_color(int workerid)
			
 
				 {
			
 
				 	if (workerid >= STARPU_NMAXWORKERS)
			
@@ -907,6 +918,14 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
				 			arch.devices[0].devid = devid;
			
 
				 			arch.devices[0].ncores = 1;
			
 
				 			break;
			
 
				+		case _STARPU_FUT_MPI_KEY:
			
 
				+			set_next_mpi_ms_worker_color(workerid);
			
 
				+			kindstr = "mpi_ms";
			
 
				+			arch.devices[0].type = STARPU_MPI_WORKER;
			
 
				+			arch.devices[0].devid = devid;
			
 
				+			arch.devices[0].ncores = 1;
			
 
				+			break;
			
 
				+			
			
 
				 		case _STARPU_FUT_SCC_KEY:
			
 
				 			set_next_scc_worker_color(workerid);
			
 
				 			kindstr = "scc";
			
@@ -1522,7 +1541,7 @@ static void handle_hypervisor_end(struct fxt_ev_64 *ev, struct starpu_fxt_option
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void handle_worker_status(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *newstatus)
			
 
				+static void handle_worker_status_on_tid(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *newstatus)
			
 
				 {
			
 
				 	int worker;
			
 
				 	worker = find_worker_id(ev->param[1]);
			
@@ -1535,6 +1554,19 @@ static void handle_worker_status(struct fxt_ev_64 *ev, struct starpu_fxt_options
 
				 		recfmt_thread_set_state(get_event_time_stamp(ev, options), ev->param[1], newstatus, "Runtime");
			
 
				 }
			
 
				 
			
 
				+static void handle_worker_status(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *newstatus)
			
 
				+{
			
 
				+	int worker;
			
 
				+	worker = ev->param[1];
			
 
				+	if (worker < 0)
			
 
				+		return;
			
 
				+
			
 
				+	if (out_paje_file)
			
 
				+		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], newstatus);
			
 
				+	if (trace_file)
			
 
				+		recfmt_worker_set_state(get_event_time_stamp(ev, options), ev->param[1], newstatus, "Runtime");
			
 
				+}
			
 
				+
			
 
				 static double last_sleep_start[STARPU_NMAXWORKERS];
			
 
				 
			
 
				 static void handle_worker_scheduling_start(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
@@ -2618,22 +2650,30 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
				 				break;
			
 
				 
			
 
				 			/* check the memory transfer overhead */
			
 
				-			case _STARPU_FUT_START_FETCH_INPUT:
			
 
				-				handle_worker_status(&ev, options, "Fi");
			
 
				+			case _STARPU_FUT_START_FETCH_INPUT_ON_TID:
			
 
				+				handle_worker_status_on_tid(&ev, options, "Fi");
			
 
				 				break;
			
 
				-			case _STARPU_FUT_START_PUSH_OUTPUT:
			
 
				-				handle_worker_status(&ev, options, "Po");
			
 
				+			case _STARPU_FUT_START_PUSH_OUTPUT_ON_TID:
			
 
				+				handle_worker_status_on_tid(&ev, options, "Po");
			
 
				 				break;
			
 
				-			case _STARPU_FUT_START_PROGRESS:
			
 
				-				handle_worker_status(&ev, options, "P");
			
 
				+			case _STARPU_FUT_START_PROGRESS_ON_TID:
			
 
				+				handle_worker_status_on_tid(&ev, options, "P");
			
 
				 				break;
			
 
				-			case _STARPU_FUT_START_UNPARTITION:
			
 
				-				handle_worker_status(&ev, options, "U");
			
 
				+			case _STARPU_FUT_START_UNPARTITION_ON_TID:
			
 
				+				handle_worker_status_on_tid(&ev, options, "U");
			
 
				 				break;
			
 
				+			case _STARPU_FUT_END_FETCH_INPUT_ON_TID:
			
 
				+			case _STARPU_FUT_END_PROGRESS_ON_TID:
			
 
				+			case _STARPU_FUT_END_PUSH_OUTPUT_ON_TID:
			
 
				+			case _STARPU_FUT_END_UNPARTITION_ON_TID:
			
 
				+				handle_worker_status_on_tid(&ev, options, "B");
			
 
				+				break;
			
 
				+
			
 
				+			case _STARPU_FUT_START_FETCH_INPUT:
			
 
				+				handle_worker_status(&ev, options, "Fi");
			
 
				+				break;
			
 
				+
			
 
				 			case _STARPU_FUT_END_FETCH_INPUT:
			
 
				-			case _STARPU_FUT_END_PROGRESS:
			
 
				-			case _STARPU_FUT_END_PUSH_OUTPUT:
			
 
				-			case _STARPU_FUT_END_UNPARTITION:
			
 
				 				handle_worker_status(&ev, options, "B");
			
 
				 				break;
			
 
				 
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -241,9 +241,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 
				 #endif
			
 
				 
			
 
				 	_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				-	res = __starpu_datawizard_progress(memnode, 1, 1);
			
 
				-	if (memnode != STARPU_MAIN_RAM)
			
 
				-		res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
			
 
				+	res = __starpu_datawizard_progress(1, 1);
			
 
				 	_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				 
			
 
				 	struct _starpu_job *j;
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -810,16 +810,14 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
				 	if (!idle)
			
 
				 	{
			
 
				 		/* Nothing ready yet, no better thing to do than waiting */
			
 
				-		__starpu_datawizard_progress(memnode, 1, 0);
			
 
				-		__starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 0);
			
 
				+		__starpu_datawizard_progress(1, 0);
			
 
				 		return 0;
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				 	/* Something done, make some progress */
			
 
				 	res = !idle;
			
 
				-	res |= __starpu_datawizard_progress(memnode, 1, 1);
			
 
				-	res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
			
 
				+	res |= __starpu_datawizard_progress(1, 1);
			
 
				 
			
 
				 	/* And pull tasks */
			
 
				 	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, memnode);
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -489,6 +489,11 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 
				 		{
			
 
				 			tasks[i] = NULL;
			
 
				 		}
			
 
				+                /* don't push a task if we are already pushing one */
			
 
				+                else if (workers[i].task_sending != NULL)
			
 
				+                {
			
 
				+                        tasks[i] = NULL;
			
 
				+                }
			
 
				 		/*else try to pop a task*/
			
 
				 		else
			
 
				 		{
			
--- a/src/drivers/mic/driver_mic_common.c
+++ b/src/drivers/mic/driver_mic_common.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -31,7 +31,7 @@ void _starpu_mic_common_report_scif_error(const char *func, const char *file, co
 
				  * care about it.
			
 
				  */
			
 
				 
			
 
				-void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len)
			
 
				+void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len, void * event)
			
 
				 {
			
 
				   if ((scif_send(node->mp_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
			
 
				 		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
			
@@ -56,7 +56,7 @@ int _starpu_mic_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
 
				  * care about it.
			
 
				  */
			
 
				 
			
 
				-void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len)
			
 
				+void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event)
			
 
				 {
			
 
				 	if ((scif_recv(node->mp_connection.mic_endpoint, msg, len, SCIF_RECV_BLOCK)) < 0)
			
 
				 		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
			
@@ -65,7 +65,7 @@ void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int
 
				 /* Handles the error so the caller (which must be generic) doesn't have to
			
 
				  * care about it.
			
 
				  */
			
 
				-void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg, int len)
			
 
				+void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg, int len, void * event)
			
 
				 {
			
 
				 	if ((scif_send(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
			
 
				 		STARPU_MP_COMMON_REPORT_ERROR(mp_node, errno);
			
@@ -74,7 +74,7 @@ void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg
 
				 /* Handles the error so the caller (which must be generic) doesn't have to
			
 
				  * care about it.
			
 
				  */
			
 
				-void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *mp_node, void *msg, int len)
			
 
				+void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *mp_node, void *msg, int len, void * event)
			
 
				 {
			
 
				 	if ((scif_recv(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
			
 
				 		STARPU_MP_COMMON_REPORT_ERROR(mp_node, errno);
			
--- a/src/drivers/mic/driver_mic_common.h
+++ b/src/drivers/mic/driver_mic_common.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -56,13 +56,13 @@ void _starpu_mic_common_report_scif_error(const char *func, const char *file, in
 
				 
			
 
				 int _starpu_mic_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
			
 
				 
			
 
				-void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len);
			
 
				+void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len, void * event);
			
 
				 
			
 
				-void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len);
			
 
				+void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event);
			
 
				 
			
 
				-void _starpu_mic_common_dt_send(const struct _starpu_mp_node *node, void *msg, int len);
			
 
				+void _starpu_mic_common_dt_send(const struct _starpu_mp_node *node, void *msg, int len, void * event);
			
 
				 
			
 
				-void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *node, void *msg, int len);
			
 
				+void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event);
			
 
				 
			
 
				 void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node, COIPROCESS process,
			
 
				 				uint16_t local_port_number, uint16_t remote_port_number);
			
--- a/src/drivers/mic/driver_mic_source.c
+++ b/src/drivers/mic/driver_mic_source.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -407,7 +407,7 @@ int _starpu_mic_copy_ram_to_mic(void *src, unsigned src_node STARPU_ATTRIBUTE_UN
 
				 {
			
 
				 	const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(dst_node);
			
 
				 
			
 
				-	return _starpu_src_common_copy_host_to_sink(mp_node, src, dst, size);
			
 
				+	return _starpu_src_common_copy_host_to_sink_sync(mp_node, src, dst, size);
			
 
				 }
			
 
				 
			
 
				 /* Transfert SIZE bytes from the address pointed by SRC in the SRC_NODE memory
			
@@ -417,7 +417,7 @@ int _starpu_mic_copy_mic_to_ram(void *src, unsigned src_node, void *dst, unsigne
 
				 {
			
 
				 	const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(src_node);
			
 
				 
			
 
				-	return _starpu_src_common_copy_sink_to_host(mp_node, src, dst, size);
			
 
				+	return _starpu_src_common_copy_sink_to_host_sync(mp_node, src, dst, size);
			
 
				 }
			
 
				 
			
 
				 /* Asynchronous transfers */
			
--- a/src/drivers/mp_common/mp_common.c
+++ b/src/drivers/mp_common/mp_common.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -26,6 +26,9 @@
 
				 #include <drivers/scc/driver_scc_common.h>
			
 
				 #include <drivers/scc/driver_scc_source.h>
			
 
				 #include <drivers/scc/driver_scc_sink.h>
			
 
				+#include <drivers/mpi/driver_mpi_common.h>
			
 
				+#include <drivers/mpi/driver_mpi_source.h>
			
 
				+#include <drivers/mpi/driver_mpi_sink.h>
			
 
				 
			
 
				 #include <common/list.h>
			
 
				 
			
@@ -159,6 +162,8 @@ _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
 
				 		node->dt_send = _starpu_mic_common_dt_send;
			
 
				 		node->dt_recv = _starpu_mic_common_dt_recv;
			
 
				 
			
 
				+                node->dt_test = NULL; /* Not used now */
			
 
				+
			
 
				 		node->get_kernel_from_job = NULL;
			
 
				 		node->lookup = _starpu_mic_sink_lookup;
			
 
				 		node->bind_thread = _starpu_mic_sink_bind_thread;
			
@@ -209,6 +214,8 @@ _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
 
				 		node->dt_send_to_device = _starpu_scc_sink_send_to_device;
			
 
				 		node->dt_recv_from_device = _starpu_scc_sink_recv_from_device;
			
 
				 
			
 
				+                node->dt_test = NULL; /* Not used now */
			
 
				+
			
 
				 		node->get_kernel_from_job = NULL;
			
 
				 		node->lookup = _starpu_scc_sink_lookup;
			
 
				 		node->bind_thread = _starpu_scc_sink_bind_thread;
			
@@ -219,15 +226,72 @@ _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
 
				 	break;
			
 
				 #endif /* STARPU_USE_SCC */
			
 
				 
			
 
				-#ifdef STARPU_USE_MPI
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				 	case STARPU_NODE_MPI_SOURCE:
			
 
				-		STARPU_ABORT();
			
 
				+        {
			
 
				+                /*
			
 
				+                   node->nb_mp_sinks = 
			
 
				+                   node->devid = 
			
 
				+                   */
			
 
				+                node->peer_id = (_starpu_mpi_common_get_src_node() <= peer_id ? peer_id+1 : peer_id);
			
 
				+                node->mp_connection.mpi_remote_nodeid = node->peer_id;
			
 
				+
			
 
				+                node->init = _starpu_mpi_source_init;
			
 
				+                node->launch_workers = NULL;
			
 
				+                node->deinit = _starpu_mpi_source_deinit;
			
 
				+                /*     node->report_error = */
			
 
				+
			
 
				+                node->mp_recv_is_ready = _starpu_mpi_common_recv_is_ready;
			
 
				+                node->mp_send = _starpu_mpi_common_mp_send;
			
 
				+                node->mp_recv = _starpu_mpi_common_mp_recv;
			
 
				+                node->dt_send = _starpu_mpi_common_send;
			
 
				+                node->dt_recv = _starpu_mpi_common_recv;
			
 
				+                node->dt_send_to_device = _starpu_mpi_common_send_to_device;
			
 
				+                node->dt_recv_from_device = _starpu_mpi_common_recv_from_device;
			
 
				+
			
 
				+                node->get_kernel_from_job = _starpu_mpi_ms_src_get_kernel_from_job;
			
 
				+                node->lookup = NULL;
			
 
				+                node->bind_thread = NULL;
			
 
				+                node->execute = NULL;
			
 
				+                node->allocate = NULL;
			
 
				+                node->free = NULL;
			
 
				+        }
			
 
				+        break;
			
 
				+
			
 
				+        case STARPU_NODE_MPI_SINK:
			
 
				+        {
			
 
				+                /*
			
 
				+                   node->nb_mp_sinks = 
			
 
				+                   node->devid = 
			
 
				+                   */
			
 
				+                node->mp_connection.mpi_remote_nodeid = _starpu_mpi_common_get_src_node();
			
 
				+
			
 
				+                node->init = _starpu_mpi_sink_init;
			
 
				+                node->launch_workers = _starpu_mpi_sink_launch_workers;
			
 
				+                node->deinit = _starpu_mpi_sink_deinit;
			
 
				+                /*    node->report_error =  */
			
 
				+
			
 
				+                node->mp_recv_is_ready = _starpu_mpi_common_recv_is_ready;
			
 
				+                node->mp_send = _starpu_mpi_common_mp_send;
			
 
				+                node->mp_recv = _starpu_mpi_common_mp_recv;
			
 
				+                node->dt_send = _starpu_mpi_common_send;
			
 
				+                node->dt_recv = _starpu_mpi_common_recv;
			
 
				+                node->dt_send_to_device = _starpu_mpi_common_send_to_device;
			
 
				+                node->dt_recv_from_device = _starpu_mpi_common_recv_from_device;
			
 
				+
			
 
				+                node->dt_test = _starpu_mpi_common_test_event;
			
 
				+
			
 
				+                node->get_kernel_from_job = NULL;
			
 
				+                node->lookup = _starpu_mpi_sink_lookup;
			
 
				+                node->bind_thread = _starpu_mpi_sink_bind_thread;
			
 
				+                node->execute = _starpu_sink_common_execute;
			
 
				+                node->allocate = _starpu_sink_common_allocate;
			
 
				+                node->free = _starpu_sink_common_free;
			
 
				+
			
 
				+
			
 
				+        }
			
 
				 		break;
			
 
				-
			
 
				-	case STARPU_NODE_MPI_SINK:
			
 
				-		STARPU_ABORT();
			
 
				-		break;
			
 
				-#endif /* STARPU_USE_MPI */
			
 
				+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				 
			
 
				 	default:
			
 
				 		STARPU_ASSERT(0);
			
@@ -243,8 +307,12 @@ _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
 
				 	mp_message_list_init(&node->message_queue);
			
 
				 	STARPU_PTHREAD_MUTEX_INIT(&node->message_queue_mutex,NULL);
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_INIT(&node->connection_mutex, NULL);
			
 
				+
			
 
				+        _starpu_mp_event_list_init(&node->event_list);
			
 
				+
			
 
				 	/* If the node is a sink then we must initialize some field */
			
 
				-	if(node->kind == STARPU_NODE_MIC_SINK || node->kind == STARPU_NODE_SCC_SINK)
			
 
				+	if(node->kind == STARPU_NODE_MIC_SINK || node->kind == STARPU_NODE_SCC_SINK || node->kind == STARPU_NODE_MPI_SINK)
			
 
				 	{
			
 
				 		int i;
			
 
				 		node->is_running = 1;
			
@@ -258,7 +326,6 @@ _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
 
				 		}
			
 
				 		mp_barrier_list_init(&node->barrier_list);
			
 
				 		STARPU_PTHREAD_MUTEX_INIT(&node->barrier_mutex,NULL);
			
 
				-
			
 
				 		STARPU_PTHREAD_BARRIER_INIT(&node->init_completed_barrier, NULL, node->nb_cores+1);
			
 
				 
			
 
				 		node->launch_workers(node);
			
@@ -276,7 +343,7 @@ void _starpu_mp_common_node_destroy(struct _starpu_mp_node *node)
 
				 	STARPU_PTHREAD_MUTEX_DESTROY(&node->message_queue_mutex);
			
 
				 
			
 
				 	/* If the node is a sink then we must destroy some field */
			
 
				-	if(node->kind == STARPU_NODE_MIC_SINK || node->kind == STARPU_NODE_SCC_SINK)
			
 
				+	if(node->kind == STARPU_NODE_MIC_SINK || node->kind == STARPU_NODE_SCC_SINK || node->kind == STARPU_NODE_MPI_SINK)
			
 
				 	{
			
 
				 		int i;
			
 
				 		for(i=0; i<node->nb_cores; i++)
			
@@ -303,6 +370,8 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 
				 {
			
 
				 	STARPU_ASSERT_MSG(arg_size <= BUFFER_SIZE, "Too much data (%d) for the static MIC buffer (%d), increase BUFFER_SIZE perhaps?", arg_size, BUFFER_SIZE);
			
 
				 
			
 
				+        //printf("SEND CMD : %d - arg_size %d by %lu \n", command, arg_size, pthread_self());
			
 
				+
			
 
				 	/* MIC and MPI sizes are given through a int */
			
 
				 	int command_size = sizeof(enum _starpu_mp_command);
			
 
				 	int arg_size_size = sizeof(int);
			
@@ -337,6 +406,8 @@ enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_n
 
				 	command = *((enum _starpu_mp_command *) node->buffer);
			
 
				 	*arg_size = *((int *) ((uintptr_t)node->buffer + command_size));
			
 
				 
			
 
				+        //printf("RECV command : %d - arg_size %d by %lu \n", command, *arg_size, pthread_self());
			
 
				+
			
 
				 	/* If there is no argument (ie. arg_size == 0),
			
 
				 	 * let's return the command right now */
			
 
				 	if (!(*arg_size))
			
--- a/src/drivers/mp_common/mp_common.h
+++ b/src/drivers/mp_common/mp_common.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -26,6 +26,7 @@
 
				 #include <common/barrier.h>
			
 
				 #include <common/thread.h>
			
 
				 #include <datawizard/interfaces/data_interface.h>
			
 
				+#include <datawizard/copy_driver.h>
			
 
				 
			
 
				 #ifdef STARPU_USE_MP
			
 
				 
			
@@ -52,10 +53,21 @@ enum _starpu_mp_command
 
				 	STARPU_MP_COMMAND_ANSWER_ALLOCATE,
			
 
				 	STARPU_MP_COMMAND_ERROR_ALLOCATE,
			
 
				 	STARPU_MP_COMMAND_FREE,
			
 
				+        /* Synchronous transfers */
			
 
				 	STARPU_MP_COMMAND_RECV_FROM_HOST,
			
 
				 	STARPU_MP_COMMAND_SEND_TO_HOST,
			
 
				 	STARPU_MP_COMMAND_RECV_FROM_SINK,
			
 
				 	STARPU_MP_COMMAND_SEND_TO_SINK,
			
 
				+        /* Asynchronous transfers */
			
 
				+        STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC,
			
 
				+        STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC_COMPLETED,
			
 
				+	STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC,
			
 
				+	STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC_COMPLETED,
			
 
				+	STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC,
			
 
				+	STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC_COMPLETED,
			
 
				+	STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC,
			
 
				+	STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC_COMPLETED,
			
 
				+
			
 
				 	STARPU_MP_COMMAND_TRANSFER_COMPLETE,
			
 
				 	STARPU_MP_COMMAND_SINK_NBCORES,
			
 
				 	STARPU_MP_COMMAND_ANSWER_SINK_NBCORES,
			
@@ -88,13 +100,16 @@ union _starpu_mp_connection
 
				 #ifdef STARPU_USE_SCC
			
 
				 	int scc_nodeid;
			
 
				 #endif
			
 
				-	int mpi_nodeid;
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+	int mpi_remote_nodeid;
			
 
				+#endif
			
 
				 };
			
 
				 
			
 
				 struct _starpu_mp_transfer_command
			
 
				 {
			
 
				 	size_t size;
			
 
				 	void *addr;
			
 
				+        void *event;
			
 
				 };
			
 
				 
			
 
				 struct _starpu_mp_transfer_command_to_device
			
@@ -102,6 +117,7 @@ struct _starpu_mp_transfer_command_to_device
 
				 	int devid;
			
 
				 	size_t size;
			
 
				 	void *addr;
			
 
				+        void *event;
			
 
				 };
			
 
				 
			
 
				 LIST_TYPE(mp_barrier,
			
@@ -129,6 +145,12 @@ struct mp_task
 
				  	struct mp_barrier* mp_barrier;
			
 
				 };
			
 
				 
			
 
				+LIST_TYPE(_starpu_mp_event,
			
 
				+                struct _starpu_async_channel event;
			
 
				+                void * remote_event;
			
 
				+                enum _starpu_mp_command answer_cmd;
			
 
				+);
			
 
				+
			
 
				 
			
 
				 /* Message-passing working node, whether source
			
 
				  * or sink */
			
@@ -167,61 +189,74 @@ struct _starpu_mp_node
 
				 	 * sink it controls */
			
 
				 	union _starpu_mp_connection mp_connection;
			
 
				 
			
 
				-	/* Only MIC use this for now !!
			
 
				-	 * Connection used for data transfers between the host and his sink. */
			
 
				-	union _starpu_mp_connection host_sink_dt_connection;
			
 
				+        /* Only MIC uses this for now !!
			
 
				+         * Connection used for data transfers between the host and its sink. */
			
 
				+        union _starpu_mp_connection host_sink_dt_connection;
			
 
				 
			
 
				-	/* Only MIC use this for now !!
			
 
				-	 * Only sink use this for now !!
			
 
				-	 * Connection used for data transfer between devices.
			
 
				-	 * A sink opens a connection with each other sink,
			
 
				-	 * thus each sink can directly send data to each other.
			
 
				-	 * For sink :
			
 
				-	 *  - sink_sink_dt_connections[i] is the connection to the sink number i.
			
 
				-	 *  - sink_sink_dt_connections[j] is not initialized for the sink number j. */
			
 
				-	union _starpu_mp_connection *sink_sink_dt_connections;
			
 
				-
			
 
				-	/* */
			
 
				-	starpu_pthread_barrier_t init_completed_barrier; 
			
 
				-	
			
 
				-	/* table to store pointer of the thread workers*/
			
 
				-	void* thread_table;
			
 
				+        /* Mutex to protect against the interleaving of communications when using one thread per node,
			
 
				+         * for instance, when a thread transfers a piece of data and another one wants to use
			
 
				+         * a sink_to_sink communication */
			
 
				+        starpu_pthread_mutex_t connection_mutex;
			
 
				+
			
 
				+        /* Only MIC use this for now !!
			
 
				+         * Only sink use this for now !!
			
 
				+         * Connection used for data transfer between devices.
			
 
				+         * A sink opens a connection with each other sink,
			
 
				+         * thus each sink can directly send data to each other.
			
 
				+         * For sink :
			
 
				+         *  - sink_sink_dt_connections[i] is the connection to the sink number i.
			
 
				+         *  - sink_sink_dt_connections[j] is not initialized for the sink number j. */
			
 
				+        union _starpu_mp_connection *sink_sink_dt_connections;
			
 
				+
			
 
				+        /* This list contains events
			
 
				+         * about asynchronous request
			
 
				+         */
			
 
				+        struct _starpu_mp_event_list event_list;
			
 
				+
			
 
				+        /* */
			
 
				+        starpu_pthread_barrier_t init_completed_barrier; 
			
 
				+
			
 
				+        /* table to store pointer of the thread workers*/
			
 
				+        void* thread_table;
			
 
				 
			
 
				         /*list where threads add messages to send to the source node */
			
 
				         struct mp_message_list message_queue;
			
 
				-	starpu_pthread_mutex_t message_queue_mutex;
			
 
				-
			
 
				-	/*list of barrier for combined worker*/
			
 
				-	struct mp_barrier_list barrier_list;
			
 
				-	starpu_pthread_mutex_t barrier_mutex;
			
 
				-
			
 
				-	/*table where worker comme pick task*/
			
 
				-	struct mp_task ** run_table;
			
 
				-	sem_t * sem_run_table;
			
 
				-
			
 
				-	/* Node general functions */
			
 
				-	void (*init)(struct _starpu_mp_node *node);
			
 
				-	void (*launch_workers)(struct _starpu_mp_node *node);
			
 
				-	void (*deinit)(struct _starpu_mp_node *node);
			
 
				-	void (*report_error)(const char *, const char *, const int, const int);
			
 
				-
			
 
				-	/* Message passing */
			
 
				-	int (*mp_recv_is_ready)(const struct _starpu_mp_node *);
			
 
				-	void (*mp_send)(const struct _starpu_mp_node *, void *, int);
			
 
				-	void (*mp_recv)(const struct _starpu_mp_node *, void *, int);
			
 
				-
			
 
				-	/* Data transfers */
			
 
				-	void (*dt_send)(const struct _starpu_mp_node *, void *, int);
			
 
				-	void (*dt_recv)(const struct _starpu_mp_node *, void *, int);
			
 
				-	void (*dt_send_to_device)(const struct _starpu_mp_node *, int, void *, int);
			
 
				-	void (*dt_recv_from_device)(const struct _starpu_mp_node *, int, void *, int);
			
 
				-
			
 
				-	void (*(*get_kernel_from_job)(const struct _starpu_mp_node *,struct _starpu_job *))(void);
			
 
				-	void (*(*lookup)(const struct _starpu_mp_node *, char* ))(void);
			
 
				-	void (*bind_thread)(const struct _starpu_mp_node *, int,int *,int);
			
 
				-	void (*execute)(struct _starpu_mp_node *, void *, int);
			
 
				-	void (*allocate)(const struct _starpu_mp_node *, void *, int);
			
 
				-	void (*free)(const struct _starpu_mp_node *, void *, int);
			
 
				+        starpu_pthread_mutex_t message_queue_mutex;
			
 
				+
			
 
				+        /*list of barrier for combined worker*/
			
 
				+        struct mp_barrier_list barrier_list;
			
 
				+        starpu_pthread_mutex_t barrier_mutex;
			
 
				+
			
 
				+        /* table where workers come to pick tasks */
			
 
				+        struct mp_task ** run_table;
			
 
				+        sem_t * sem_run_table;
			
 
				+
			
 
				+        /* Node general functions */
			
 
				+        void (*init)            (struct _starpu_mp_node *node);
			
 
				+        void (*launch_workers)  (struct _starpu_mp_node *node);
			
 
				+        void (*deinit)          (struct _starpu_mp_node *node);
			
 
				+        void (*report_error)    (const char *, const char *, const int, const int);
			
 
				+
			
 
				+        /* Message passing */
			
 
				+        int (*mp_recv_is_ready) (const struct _starpu_mp_node *);
			
 
				+        void (*mp_send)         (const struct _starpu_mp_node *, void *, int);
			
 
				+        void (*mp_recv)         (const struct _starpu_mp_node *, void *, int);
			
 
				+
			
 
				+        /* Data transfers */
			
 
				+        void (*dt_send)             (const struct _starpu_mp_node *, void *, int, void *);
			
 
				+        void (*dt_recv)             (const struct _starpu_mp_node *, void *, int, void *);
			
 
				+        void (*dt_send_to_device)   (const struct _starpu_mp_node *, int, void *, int, void *);
			
 
				+        void (*dt_recv_from_device) (const struct _starpu_mp_node *, int, void *, int, void *);
			
 
				+
			
 
				+        /* Test async transfers */
			
 
				+        int (*dt_test) (struct _starpu_async_channel *);
			
 
				+
			
 
				+        void (*(*get_kernel_from_job)   (const struct _starpu_mp_node *,struct _starpu_job *))(void);
			
 
				+        void (*(*lookup)                (const struct _starpu_mp_node *, char* ))(void);
			
 
				+        void (*bind_thread)             (const struct _starpu_mp_node *, int,int *,int);
			
 
				+        void (*execute)                 (struct _starpu_mp_node *, void *, int);
			
 
				+        void (*allocate)                (const struct _starpu_mp_node *, void *, int);
			
 
				+        void (*free)                    (const struct _starpu_mp_node *, void *, int);
			
 
				 };
			
 
				 
			
 
				 struct _starpu_mp_node * _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind, int peer_devid) STARPU_ATTRIBUTE_MALLOC;
			
--- a/src/drivers/mp_common/sink_common.c
+++ b/src/drivers/mp_common/sink_common.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -19,6 +19,7 @@
 
				 #include <common/config.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <drivers/mp_common/mp_common.h>
			
 
				+#include <drivers/mpi/driver_mpi_common.h>
			
 
				 #include <datawizard/interfaces/data_interface.h>
			
 
				 #include <common/barrier.h>
			
 
				 #include <core/workers.h>
			
@@ -29,7 +30,6 @@
 
				 
			
 
				 #include "sink_common.h"
			
 
				 
			
 
				-
			
 
				 /* Return the sink kind of the running process, based on the value of the
			
 
				  * STARPU_SINK environment variable.
			
 
				  * If there is no valid value retrieved, return STARPU_INVALID_KIND
			
@@ -45,7 +45,7 @@ static enum _starpu_mp_node_kind _starpu_sink_common_get_kind(void)
 
				 		return STARPU_NODE_MIC_SINK;
			
 
				 	else if (!strcmp(node_kind, "STARPU_SCC"))
			
 
				 		return STARPU_NODE_SCC_SINK;
			
 
				-	else if (!strcmp(node_kind, "STARPU_MPI"))
			
 
				+	else if (!strcmp(node_kind, "STARPU_MPI_MS"))
			
 
				 		return STARPU_NODE_MPI_SINK;
			
 
				 	else
			
 
				 		return STARPU_NODE_INVALID_KIND;
			
@@ -108,46 +108,168 @@ void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRI
 
				 	free(*(void **)(arg));
			
 
				 }
			
 
				 
			
 
				-static void _starpu_sink_common_copy_from_host(const struct _starpu_mp_node *mp_node,
			
 
				+static void _starpu_sink_common_copy_from_host_sync(const struct _starpu_mp_node *mp_node,
			
 
				+					       void *arg, int arg_size)
			
 
				+{
			
 
				+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
			
 
				+
			
 
				+        struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
			
 
				+
			
 
				+        mp_node->dt_recv(mp_node, cmd->addr, cmd->size, NULL);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static void _starpu_sink_common_copy_from_host_async(struct _starpu_mp_node *mp_node,
			
 
				 					       void *arg, int arg_size)
			
 
				 {
			
 
				 	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
			
 
				 
			
 
				+        struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
			
 
				+
			
 
				+        /* For asynchronous transfers, we store events to test them later when they are finished */
			
 
				+        struct _starpu_mp_event * sink_event = _starpu_mp_event_new();
			
 
				+        /* Save the command to send */
			
 
				+        sink_event->answer_cmd = STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC_COMPLETED;
			
 
				+        sink_event->remote_event = cmd->event;
			
 
				+
			
 
				+        /* Set the sender (host) ready because we don't want to wait its ack */
			
 
				+        struct _starpu_async_channel * async_channel = &sink_event->event;
			
 
				+        async_channel->type = STARPU_UNUSED;
			
 
				+        async_channel->starpu_mp_common_finished_sender = -1;
			
 
				+        async_channel->starpu_mp_common_finished_receiver = 0;
			
 
				+        async_channel->polling_node_receiver = NULL;
			
 
				+        async_channel->polling_node_sender = NULL;
			
 
				+
			
 
				+        mp_node->dt_recv(mp_node, cmd->addr, cmd->size, &sink_event->event);
			
 
				+        /* Push event on the list */
			
 
				+        _starpu_mp_event_list_push_back(&mp_node->event_list, sink_event);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static void _starpu_sink_common_copy_to_host_sync(const struct _starpu_mp_node *mp_node,
			
 
				+					     void *arg, int arg_size)
			
 
				+{
			
 
				+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
			
 
				+
			
 
				 	struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
			
 
				 
			
 
				-	mp_node->dt_recv(mp_node, cmd->addr, cmd->size);
			
 
				+        /* Save values before sending command to prevent the overwriting */
			
 
				+        size_t size = cmd->size;
			
 
				+        void * addr = cmd->addr;
			
 
				+
			
 
				+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_SEND_TO_HOST, NULL, 0);
			
 
				+
			
 
				+        mp_node->dt_send(mp_node, addr, size, NULL);
			
 
				 }
			
 
				 
			
 
				-static void _starpu_sink_common_copy_to_host(const struct _starpu_mp_node *mp_node,
			
 
				+
			
 
				+static void _starpu_sink_common_copy_to_host_async(struct _starpu_mp_node *mp_node,
			
 
				 					     void *arg, int arg_size)
			
 
				 {
			
 
				 	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
			
 
				 
			
 
				 	struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
			
 
				 
			
 
				-	mp_node->dt_send(mp_node, cmd->addr, cmd->size);
			
 
				+        /* For asynchronous transfers, we need to tell dt_send that we are in async mode
			
 
				+         * and we push the event (carrying the answer command) on the node's event list
			
 
				+         */
			
 
				+        struct _starpu_mp_event * sink_event = _starpu_mp_event_new();
			
 
				+        /* Save the command to send */
			
 
				+        sink_event->answer_cmd = STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC_COMPLETED;
			
 
				+        sink_event->remote_event = cmd->event;
			
 
				+
			
 
				+        /* Set the receiver (host) ready because we don't want to wait its ack */
			
 
				+        struct _starpu_async_channel * async_channel = &sink_event->event;
			
 
				+        async_channel->type = STARPU_UNUSED;
			
 
				+        async_channel->starpu_mp_common_finished_sender = 0;
			
 
				+        async_channel->starpu_mp_common_finished_receiver = -1;
			
 
				+        async_channel->polling_node_receiver = NULL;
			
 
				+        async_channel->polling_node_sender = NULL;
			
 
				+
			
 
				+        mp_node->dt_send(mp_node, cmd->addr, cmd->size, &sink_event->event);
			
 
				+        /* Push event on the list */
			
 
				+        _starpu_mp_event_list_push_back(&mp_node->event_list, sink_event);
			
 
				 }
			
 
				 
			
 
				-static void _starpu_sink_common_copy_from_sink(const struct _starpu_mp_node *mp_node,
			
 
				+
			
 
				+static void _starpu_sink_common_copy_from_sink_sync(const struct _starpu_mp_node *mp_node,
			
 
				 					       void *arg, int arg_size)
			
 
				 {
			
 
				 	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
			
 
				 
			
 
				 	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
			
 
				 
			
 
				-	mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size);
			
 
				+        mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size, NULL);
			
 
				+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_TRANSFER_COMPLETE, NULL, 0);
			
 
				+}
			
 
				 
			
 
				-	_starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_TRANSFER_COMPLETE, NULL, 0);
			
 
				+
			
 
				+static void _starpu_sink_common_copy_from_sink_async(struct _starpu_mp_node *mp_node,
			
 
				+					       void *arg, int arg_size)
			
 
				+{
			
 
				+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
			
 
				+
			
 
				+	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
			
 
				+
			
 
				+        /* For asynchronous transfers, we store events to test them later when they are finished
			
 
				+        */
			
 
				+        struct _starpu_mp_event * sink_event = _starpu_mp_event_new();
			
 
				+        /* Save the command to send */
			
 
				+        sink_event->answer_cmd = STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC_COMPLETED;
			
 
				+        sink_event->remote_event = cmd->event;
			
 
				+
			
 
				+        /* Set the sender ready because we don't want to wait its ack */
			
 
				+        struct _starpu_async_channel * async_channel = &sink_event->event;
			
 
				+        async_channel->type = STARPU_UNUSED;
			
 
				+        async_channel->starpu_mp_common_finished_sender = -1;
			
 
				+        async_channel->starpu_mp_common_finished_receiver = 0;
			
 
				+        async_channel->polling_node_receiver = NULL;
			
 
				+        async_channel->polling_node_sender = NULL;
			
 
				+
			
 
				+        mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size, &sink_event->event);
			
 
				+        /* Push event on the list */
			
 
				+        _starpu_mp_event_list_push_back(&mp_node->event_list, sink_event);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+static void _starpu_sink_common_copy_to_sink_sync(const struct _starpu_mp_node *mp_node,
			
 
				+					     void *arg, int arg_size)
			
 
				+{
			
 
				+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
			
 
				+
			
 
				+	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
			
 
				+
			
 
				+        mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size, NULL);
			
 
				 }
			
 
				 
			
 
				-static void _starpu_sink_common_copy_to_sink(const struct _starpu_mp_node *mp_node,
			
 
				+
			
 
				+static void _starpu_sink_common_copy_to_sink_async(struct _starpu_mp_node *mp_node,
			
 
				 					     void *arg, int arg_size)
			
 
				 {
			
 
				 	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
			
 
				 
			
 
				 	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
			
 
				 
			
 
				-	mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size);
			
 
				+        /* For asynchronous transfers, we need to tell dt_send_to_device that we are in async mode
			
 
				+         * and we push the event (carrying the answer command) on the node's event list
			
 
				+         */
			
 
				+        struct _starpu_mp_event * sink_event = _starpu_mp_event_new();
			
 
				+        /* Save the command to send */
			
 
				+        sink_event->answer_cmd = STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC_COMPLETED;
			
 
				+        sink_event->remote_event = cmd->event;
			
 
				+
			
 
				+        /* Set the receiver ready because we don't want to wait its ack */
			
 
				+        struct _starpu_async_channel * async_channel = &sink_event->event;
			
 
				+        async_channel->type = STARPU_UNUSED;
			
 
				+        async_channel->starpu_mp_common_finished_sender = 0;
			
 
				+        async_channel->starpu_mp_common_finished_receiver = -1;
			
 
				+        async_channel->polling_node_receiver = NULL;
			
 
				+        async_channel->polling_node_sender = NULL;
			
 
				+
			
 
				+        mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size, &sink_event->event);
			
 
				+
			
 
				+        /* Push event on the list */
			
 
				+        _starpu_mp_event_list_push_back(&mp_node->event_list, sink_event);
			
 
				 }
			
 
				 
			
 
				 
			
@@ -178,7 +300,7 @@ static void _starpu_sink_common_recv_workers(struct _starpu_mp_node * node, void
 
				 
			
 
				 	/* Retrieve workers */
			
 
				 	struct _starpu_worker * workers = &config->workers[baseworkerid];
			
 
				-	node->dt_recv(node,workers,worker_size);
			
 
				+	node->dt_recv(node,workers,worker_size, NULL);
			
 
				 
			
 
				 	/* Update workers to have coherent field */
			
 
				 	for(i=0; i<nworkers; i++)
			
@@ -205,7 +327,7 @@ static void _starpu_sink_common_recv_workers(struct _starpu_mp_node * node, void
 
				 
			
 
				 	/* Retrieve combined workers */
			
 
				 	struct _starpu_combined_worker * combined_workers = config->combined_workers;
			
 
				-	node->dt_recv(node, combined_workers, combined_worker_size);
			
 
				+	node->dt_recv(node, combined_workers, combined_worker_size, NULL);
			
 
				 
			
 
				 	node->baseworkerid = baseworkerid;
			
 
				 	STARPU_PTHREAD_BARRIER_WAIT(&node->init_completed_barrier);
			
@@ -267,21 +389,37 @@ void _starpu_sink_common_worker(void)
 
				 					break;
			
 
				 
			
 
				 				case STARPU_MP_COMMAND_RECV_FROM_HOST:
			
 
				-					_starpu_sink_common_copy_from_host(node, arg, arg_size);
			
 
				+					_starpu_sink_common_copy_from_host_sync(node, arg, arg_size);
			
 
				 					break;
			
 
				 
			
 
				 				case STARPU_MP_COMMAND_SEND_TO_HOST:
			
 
				-					_starpu_sink_common_copy_to_host(node, arg, arg_size);
			
 
				+					_starpu_sink_common_copy_to_host_sync(node, arg, arg_size);
			
 
				 					break;
			
 
				 
			
 
				 				case STARPU_MP_COMMAND_RECV_FROM_SINK:
			
 
				-					_starpu_sink_common_copy_from_sink(node, arg, arg_size);
			
 
				+					_starpu_sink_common_copy_from_sink_sync(node, arg, arg_size);
			
 
				 					break;
			
 
				 
			
 
				 				case STARPU_MP_COMMAND_SEND_TO_SINK:
			
 
				-					_starpu_sink_common_copy_to_sink(node, arg, arg_size);
			
 
				+					_starpu_sink_common_copy_to_sink_sync(node, arg, arg_size);
			
 
				 					break;
			
 
				 
			
 
				+                                case STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC:
			
 
				+                                        _starpu_sink_common_copy_from_host_async(node, arg, arg_size);
			
 
				+                                        break;
			
 
				+
			
 
				+                                case STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC:
			
 
				+                                        _starpu_sink_common_copy_to_host_async(node, arg, arg_size);
			
 
				+                                        break;
			
 
				+
			
 
				+                                case STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC:
			
 
				+                                        _starpu_sink_common_copy_from_sink_async(node, arg, arg_size);
			
 
				+                                        break;
			
 
				+
			
 
				+                                case STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC:
			
 
				+                                        _starpu_sink_common_copy_to_sink_async(node, arg, arg_size);
			
 
				+                                        break;
			
 
				+
			
 
				 				case STARPU_MP_COMMAND_SYNC_WORKERS:
			
 
				 					_starpu_sink_common_recv_workers(node, arg, arg_size);
			
 
				 					break;
			
@@ -307,6 +445,24 @@ void _starpu_sink_common_worker(void)
 
				 		{
			
 
				 			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
 
				 		}
			
 
				+
			
 
				+        if(!_starpu_mp_event_list_empty(&node->event_list))
			
 
				+        {
			
 
				+            struct _starpu_mp_event * sink_event = _starpu_mp_event_list_pop_front(&node->event_list);
			
 
				+            if (node->dt_test(&sink_event->event))
			
 
				+            {
			
 
				+                /* send ACK to host */
			
 
				+                _starpu_mp_common_send_command(node, sink_event->answer_cmd , &sink_event->remote_event, sizeof(sink_event->remote_event));
			
 
				+                _starpu_mp_event_delete(sink_event);
			
 
				+            }
			
 
				+            else
			
 
				+            {
			
 
				+                /* try later */
			
 
				+                 _starpu_mp_event_list_push_back(&node->event_list, sink_event);
			
 
				+            }
			
 
				+            
			
 
				+
			
 
				+        }
			
 
				 	}
			
 
				 
			
 
				 	STARPU_PTHREAD_KEY_DELETE(worker_key);
			
@@ -314,6 +470,10 @@ void _starpu_sink_common_worker(void)
 
				 	/* Deinitialize the node and release it */
			
 
				 	_starpu_mp_common_node_destroy(node);
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+        _starpu_mpi_common_mp_deinit();
			
 
				+#endif
			
 
				+
			
 
				 	exit(0);
			
 
				 }
			
 
				 
			
--- a/src/drivers/mp_common/source_common.c
+++ b/src/drivers/mp_common/source_common.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -24,9 +24,25 @@
 
				 
			
 
				 
			
 
				 #include <datawizard/coherency.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 #include <datawizard/interfaces/data_interface.h>
			
 
				 #include <drivers/mp_common/mp_common.h>
			
 
				 
			
 
				+#if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
			
 
				+struct starpu_save_thread_env
			
 
				+{
			
 
				+        struct starpu_task * current_task;
			
 
				+        struct _starpu_worker * current_worker;
			
 
				+        struct _starpu_worker_set * current_worker_set;
			
 
				+        unsigned * current_mem_node;
			
 
				+#ifdef STARPU_OPENMP
			
 
				+        struct starpu_omp_thread * current_omp_thread;
			
 
				+        struct starpu_omp_task * current_omp_task;
			
 
				+#endif
			
 
				+};
			
 
				+
			
 
				+struct starpu_save_thread_env save_thread_env[STARPU_MAXMPIDEVS];
			
 
				+#endif
			
 
				 
			
 
				 /* Finalize the execution of a task by a worker*/
			
 
				 static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starpu_worker *worker)
			
@@ -67,7 +83,7 @@ static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starp
 
				 
			
 
				 
			
 
				 /* Complete the execution of the job */
			
 
				-static int _starpu_src_common_process_completed_job(struct _starpu_worker_set *workerset, void * arg, int arg_size)
			
 
				+static int _starpu_src_common_process_completed_job(struct _starpu_mp_node *node, struct _starpu_worker_set *workerset, void * arg, int arg_size, int stored)
			
 
				 {
			
 
				 	int coreid;
			
 
				 
			
@@ -80,6 +96,10 @@ static int _starpu_src_common_process_completed_job(struct _starpu_worker_set *w
 
				 
			
 
				 	struct _starpu_worker * old_worker = _starpu_get_local_worker_key();
			
 
				 
			
 
				+        /* if arg is not copied we release the mutex */
			
 
				+        if (!stored)
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
			
 
				+
			
 
				 	_starpu_set_local_worker_key(worker);
			
 
				 	_starpu_src_common_finalize_job (j, worker);
			
 
				 	_starpu_set_local_worker_key(old_worker);
			
@@ -89,12 +109,17 @@ static int _starpu_src_common_process_completed_job(struct _starpu_worker_set *w
 
				 }
			
 
				 
			
 
				 /* Tell the scheduler when the execution has begun */
			
 
				-static void _starpu_src_common_pre_exec(void * arg, int arg_size)
			
 
				+static void _starpu_src_common_pre_exec(struct _starpu_mp_node *node, void * arg, int arg_size, int stored)
			
 
				 {
			
 
				 	int cb_workerid, i;
			
 
				 	STARPU_ASSERT(sizeof(cb_workerid) == arg_size);
			
 
				 	cb_workerid = *(int *) arg;
			
 
				 	struct _starpu_combined_worker *combined_worker = _starpu_get_combined_worker_struct(cb_workerid);
			
 
				+
			
 
				+        /* if arg is not copied we release the mutex (as the other !stored handlers do), now that cb_workerid has been read from it */
			
 
				+        if (!stored)
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
			
 
				+
			
 
				 	for(i=0; i < combined_worker->worker_size; i++)
			
 
				 	{
			
 
				 		struct _starpu_worker * worker = _starpu_get_worker_struct(combined_worker->combined_workerid[i]);
			
@@ -107,25 +132,43 @@ static void _starpu_src_common_pre_exec(void * arg, int arg_size)
 
				  * return 0 if the message has not been handle (it's certainly mean that it's a synchronous message)
			
 
				  * return 1 if the message has been handle
			
 
				  */
			
 
				-static int _starpu_src_common_handle_async(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED,
			
 
				+static int _starpu_src_common_handle_async(struct _starpu_mp_node *node,
			
 
				 		void * arg, int arg_size,
			
 
				-		enum _starpu_mp_command answer)
			
 
				+		enum _starpu_mp_command answer, int stored)
			
 
				 {
			
 
				-	struct _starpu_worker_set * worker_set=NULL;
			
 
				-	switch(answer)
			
 
				-	{
			
 
				-		case STARPU_MP_COMMAND_EXECUTION_COMPLETED:
			
 
				-			worker_set = _starpu_get_worker_struct(starpu_worker_get_id())->set;
			
 
				-			_starpu_src_common_process_completed_job(worker_set, arg, arg_size);
			
 
				-			break;
			
 
				-		case STARPU_MP_COMMAND_PRE_EXECUTION:
			
 
				-			_starpu_src_common_pre_exec(arg,arg_size);
			
 
				-			break;
			
 
				-		default:
			
 
				-			return 0;
			
 
				-			break;
			
 
				-	}
			
 
				-	return 1;
			
 
				+        struct _starpu_worker_set * worker_set = NULL;
			
 
				+        switch(answer)
			
 
				+        {
			
 
				+                case STARPU_MP_COMMAND_EXECUTION_COMPLETED:
			
 
				+                        worker_set = _starpu_get_worker_struct(starpu_worker_get_id())->set;
			
 
				+                        _starpu_src_common_process_completed_job(node, worker_set, arg, arg_size, stored);
			
 
				+                        break;
			
 
				+                case STARPU_MP_COMMAND_PRE_EXECUTION:
			
 
				+                        _starpu_src_common_pre_exec(node, arg,arg_size, stored);
			
 
				+                        break;
			
 
				+                case STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC_COMPLETED:
			
 
				+                case STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC_COMPLETED:
			
 
				+                        {
			
 
				+                                struct _starpu_async_channel * event = *((struct _starpu_async_channel **) arg);
			
 
				+                                event->starpu_mp_common_finished_receiver--;
			
 
				+                                if (!stored)
			
 
				+                                        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
			
 
				+                                break;
			
 
				+                        }
			
 
				+                case STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC_COMPLETED:
			
 
				+                case STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC_COMPLETED:
			
 
				+                        {
			
 
				+                                struct _starpu_async_channel * event = *((struct _starpu_async_channel **) arg);
			
 
				+                                event->starpu_mp_common_finished_sender--;
			
 
				+                                if (!stored)
			
 
				+                                        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
			
 
				+                                break;
			
 
				+                        }
			
 
				+                default:
			
 
				+                        return 0;
			
 
				+                        break;
			
 
				+        }
			
 
				+        return 1;
			
 
				 }
			
 
				 
			
 
				 /* Handle all message which have been stored in the message_queue */
			
@@ -137,10 +180,14 @@ static void _starpu_src_common_handle_stored_async(struct _starpu_mp_node *node)
 
				 	{
			
 
				 		/* We pop a message and handle it */
			
 
				 		struct mp_message * message = mp_message_list_pop_back(&node->message_queue);
			
 
				+                /* Release mutex during handle */
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
 
				 		_starpu_src_common_handle_async(node, message->buffer,
			
 
				-				message->size, message->type);
			
 
				+				message->size, message->type, 1);
			
 
				 		free(message->buffer);
			
 
				 		mp_message_delete(message);
			
 
				+                /* Take it again */
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
			
 
				 	}
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
 
				 }
			
@@ -167,8 +214,25 @@ int _starpu_src_common_store_message(struct _starpu_mp_node *node,
 
				 			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
 
				 			return 1;
			
 
				 			break;
			
 
				-		default:
			
 
				-			return 0;
			
 
				+                        /* For ASYNC commands don't store them, update event */
			
 
				+                case STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC_COMPLETED:
			
 
				+                case STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC_COMPLETED:
			
 
				+                        {
			
 
				+                                struct _starpu_async_channel * event = *((struct _starpu_async_channel **) arg);
			
 
				+                                event->starpu_mp_common_finished_receiver--;
			
 
				+                                return 1;
			
 
				+                                break;
			
 
				+                        }
			
 
				+                case STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC_COMPLETED:
			
 
				+                case STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC_COMPLETED:
			
 
				+                        {
			
 
				+                                struct _starpu_async_channel * event = *((struct _starpu_async_channel **) arg);
			
 
				+                                event->starpu_mp_common_finished_sender--;
			
 
				+                                return 1;
			
 
				+                                break;
			
 
				+                        }
			
 
				+                default:
			
 
				+                        return 0;
			
 
				 			break;
			
 
				 	}
			
 
				 }
			
@@ -195,7 +259,7 @@ static void _starpu_src_common_recv_async(struct _starpu_mp_node * node)
 
				 	void *arg;
			
 
				 	int arg_size;
			
 
				 	answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
			
 
				-	if(!_starpu_src_common_handle_async(node,arg,arg_size,answer))
			
 
				+	if(!_starpu_src_common_handle_async(node,arg,arg_size,answer, 0))
			
 
				 	{
			
 
				 		printf("incorrect commande: unknown command or sync command");
			
 
				 		STARPU_ASSERT(0);
			
@@ -237,13 +301,15 @@ static void _starpu_src_common_recv_async(struct _starpu_mp_node * node)
 
				 
			
 
				 
			
 
				 /* Send a request to the sink NODE for the number of cores on it. */
			
 
				-int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
			
 
				+int _starpu_src_common_sink_nbcores (struct _starpu_mp_node *node, int *buf)
			
 
				 {
			
 
				 
			
 
				 	enum _starpu_mp_command answer;
			
 
				 	void *arg;
			
 
				 	int arg_size = sizeof (int);
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&node->connection_mutex);
			
 
				+
			
 
				 	_starpu_mp_common_send_command (node, STARPU_MP_COMMAND_SINK_NBCORES, NULL, 0);
			
 
				 
			
 
				 	answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
			
@@ -252,6 +318,8 @@ int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *bu
 
				 
			
 
				 	memcpy (buf, arg, arg_size);
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -270,6 +338,8 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 
				 	/* strlen ignore the terminating '\0' */
			
 
				 	arg_size = (strlen(func_name) + 1) * sizeof(char);
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&node->connection_mutex);
			
 
				+
			
 
				 	//_STARPU_DEBUG("Looking up %s\n", func_name);
			
 
				 	_starpu_mp_common_send_command(node, STARPU_MP_COMMAND_LOOKUP, (void *) func_name,
			
 
				 			arg_size);
			
@@ -277,9 +347,11 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 
				 	answer = _starpu_src_common_wait_command_sync(node, (void **) &arg,
			
 
				 			&arg_size);
			
 
				 
			
 
				+
			
 
				 	if (answer == STARPU_MP_COMMAND_ERROR_LOOKUP)
			
 
				 	{
			
 
				 		_STARPU_DISP("Error looking up symbol %s\n", func_name);
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
			
 
				 		return -ESPIPE;
			
 
				 	}
			
 
				 
			
@@ -290,6 +362,8 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 
				 
			
 
				 	memcpy(func_ptr, arg, arg_size);
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
			
 
				+
			
 
				 	//_STARPU_DEBUG("got %p\n", *func_ptr);
			
 
				 
			
 
				 	return 0;
			
@@ -314,7 +388,6 @@ int _starpu_src_common_execute_kernel(struct _starpu_mp_node *node,
 
				 		unsigned nb_interfaces,
			
 
				 		void *cl_arg, size_t cl_arg_size)
			
 
				 {
			
 
				-
			
 
				 	void *buffer, *arg =NULL;
			
 
				 	uintptr_t buffer_ptr;
			
 
				 	int buffer_size = 0, arg_size =0;
			
@@ -384,14 +457,22 @@ int _starpu_src_common_execute_kernel(struct _starpu_mp_node *node,
 
				 	if (cl_arg)
			
 
				 		memcpy((void*) buffer_ptr, cl_arg, cl_arg_size);
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&node->connection_mutex);
			
 
				+
			
 
				 	_starpu_mp_common_send_command(node, STARPU_MP_COMMAND_EXECUTE, buffer, buffer_size);
			
 
				+
			
 
				 	enum _starpu_mp_command answer = _starpu_src_common_wait_command_sync(node, &arg, &arg_size);
			
 
				 
			
 
				-	if (answer == STARPU_MP_COMMAND_ERROR_EXECUTE)
			
 
				-		return -EINVAL;
			
 
				+        if (answer == STARPU_MP_COMMAND_ERROR_EXECUTE)
			
 
				+        {
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
			
 
				+                return -EINVAL;
			
 
				+        }
			
 
				 
			
 
				 	STARPU_ASSERT(answer == STARPU_MP_COMMAND_EXECUTION_SUBMITTED);
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
			
 
				+
			
 
				 	free(buffer);
			
 
				 
			
 
				 	return 0;
			
@@ -451,85 +532,230 @@ int _starpu_src_common_allocate(struct _starpu_mp_node *mp_node,
 
				 	void *arg;
			
 
				 	int arg_size;
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
			
 
				+
			
 
				 	_starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_ALLOCATE, &size,
			
 
				 			sizeof(size));
			
 
				 
			
 
				 	answer = _starpu_src_common_wait_command_sync(mp_node, &arg, &arg_size);
			
 
				 
			
 
				-	if (answer == STARPU_MP_COMMAND_ERROR_ALLOCATE)
			
 
				-		return 1;
			
 
				+        if (answer == STARPU_MP_COMMAND_ERROR_ALLOCATE)
			
 
				+        {
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
			
 
				+                return 1;
			
 
				+        }
			
 
				 
			
 
				 	STARPU_ASSERT(answer == STARPU_MP_COMMAND_ANSWER_ALLOCATE &&
			
 
				 			arg_size == sizeof(*addr));
			
 
				-
			
 
				+    
			
 
				 	memcpy(addr, arg, arg_size);
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				 /* Send a request to the sink linked to the MP_NODE to deallocate the memory
			
 
				  * area pointed by ADDR.
			
 
				  */
			
 
				-void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
			
 
				+void _starpu_src_common_free(struct _starpu_mp_node *mp_node,
			
 
				 		void *addr)
			
 
				 {
			
 
				-	_starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_FREE, &addr, sizeof(addr));
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
			
 
				+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_FREE, &addr, sizeof(addr));
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
			
 
				 }
			
 
				 
			
 
				-/* Send SIZE bytes pointed by SRC to DST on the sink linked to the MP_NODE.
			
 
				-*/
			
 
				-int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
			
 
				+/* Send SIZE bytes pointed by SRC to DST on the sink linked to the MP_NODE with a
			
 
				+ * synchronous mode.
			
 
				+ */
			
 
				+int _starpu_src_common_copy_host_to_sink_sync(struct _starpu_mp_node *mp_node,
			
 
				 		void *src, void *dst, size_t size)
			
 
				 {
			
 
				-	struct _starpu_mp_transfer_command cmd = {size, dst};
			
 
				+        struct _starpu_mp_transfer_command cmd = {size, dst, NULL};
			
 
				 
			
 
				-	_starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_RECV_FROM_HOST, &cmd, sizeof(cmd));
			
 
				-	mp_node->dt_send(mp_node, src, size);
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
			
 
				 
			
 
				-	return 0;
			
 
				+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_RECV_FROM_HOST, &cmd, sizeof(cmd));
			
 
				+
			
 
				+        mp_node->dt_send(mp_node, src, size, NULL);
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
			
 
				+
			
 
				+        return 0;
			
 
				 }
			
 
				 
			
 
				-/* Receive SIZE bytes pointed by SRC on the sink linked to the MP_NODE and store them in DST.
			
 
				-*/
			
 
				-int _starpu_src_common_copy_sink_to_host(const struct _starpu_mp_node *mp_node,
			
 
				+/* Send SIZE bytes pointed by SRC to DST on the sink linked to the MP_NODE with an
			
 
				+ * asynchronous mode.
			
 
				+ */
			
 
				+int _starpu_src_common_copy_host_to_sink_async(struct _starpu_mp_node *mp_node,
			
 
				+		void *src, void *dst, size_t size, void * event)
			
 
				+{
			
 
				+        struct _starpu_mp_transfer_command cmd = {size, dst, event};
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
			
 
				+
			
 
				+        /* For asynchronous transfers, we save information
			
 
				+         * to test if they are finished
			
 
				+         */
			
 
				+        struct _starpu_async_channel * async_channel = event;
			
 
				+        async_channel->polling_node_receiver = mp_node;
			
 
				+
			
 
				+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC, &cmd, sizeof(cmd));
			
 
				+
			
 
				+        mp_node->dt_send(mp_node, src, size, event);
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
			
 
				+
			
 
				+        return -EAGAIN;
			
 
				+}
			
 
				+
			
 
				+/* Receive SIZE bytes pointed by SRC on the sink linked to the MP_NODE and store them in DST
			
 
				+ * with a synchronous mode.
			
 
				+ */
			
 
				+int _starpu_src_common_copy_sink_to_host_sync(struct _starpu_mp_node *mp_node,
			
 
				 		void *src, void *dst, size_t size)
			
 
				 {
			
 
				-	struct _starpu_mp_transfer_command cmd = {size, src};
			
 
				+        enum _starpu_mp_command answer;
			
 
				+        void *arg;
			
 
				+        int arg_size;
			
 
				+        struct _starpu_mp_transfer_command cmd = {size, src, NULL};
			
 
				 
			
 
				-	_starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_SEND_TO_HOST, &cmd, sizeof(cmd));
			
 
				-	mp_node->dt_recv(mp_node, dst, size);
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
			
 
				 
			
 
				-	return 0;
			
 
				+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_SEND_TO_HOST, &cmd, sizeof(cmd));
			
 
				+
			
 
				+        answer = _starpu_src_common_wait_command_sync(mp_node, &arg, &arg_size);
			
 
				+
			
 
				+        STARPU_ASSERT(answer == STARPU_MP_COMMAND_SEND_TO_HOST);
			
 
				+
			
 
				+        mp_node->dt_recv(mp_node, dst, size, NULL);
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
			
 
				+
			
 
				+        return 0;
			
 
				+}
			
 
				+
			
 
				+/* Receive SIZE bytes pointed by SRC on the sink linked to the MP_NODE and store them in DST
			
 
				+ * with an asynchronous mode.
			
 
				+ */
			
 
				+int _starpu_src_common_copy_sink_to_host_async(struct _starpu_mp_node *mp_node,
			
 
				+		void *src, void *dst, size_t size, void * event)
			
 
				+{
			
 
				+        struct _starpu_mp_transfer_command cmd = {size, src, event};
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
			
 
				+
			
 
				+        /* For asynchronous transfers, we save information
			
 
				+         * to test if they are finished
			
 
				+         */
			
 
				+        struct _starpu_async_channel * async_channel = event;
			
 
				+        async_channel->polling_node_sender = mp_node;
			
 
				+
			
 
				+        _starpu_mp_common_send_command(mp_node, STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC, &cmd, sizeof(cmd));
			
 
				+
			
 
				+        mp_node->dt_recv(mp_node, dst, size, event);
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
			
 
				+
			
 
				+        return -EAGAIN;
			
 
				 }
			
 
				 
			
 
				 /* Tell the sink linked to SRC_NODE to send SIZE bytes of data pointed by SRC
			
 
				- * to the sink linked to DST_NODE. The latter store them in DST.
			
 
				+ * to the sink linked to DST_NODE. The latter store them in DST with a synchronous
			
 
				+ * mode.
			
 
				  */
			
 
				-int _starpu_src_common_copy_sink_to_sink(const struct _starpu_mp_node *src_node,
			
 
				-		const struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size)
			
 
				+int _starpu_src_common_copy_sink_to_sink_sync(struct _starpu_mp_node *src_node,
			
 
				+		struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size)
			
 
				 {
			
 
				-	enum _starpu_mp_command answer;
			
 
				-	void *arg;
			
 
				-	int arg_size;
			
 
				+        enum _starpu_mp_command answer;
			
 
				+        void *arg;
			
 
				+        int arg_size;
			
 
				 
			
 
				-	struct _starpu_mp_transfer_command_to_device cmd = {dst_node->peer_id, size, src};
			
 
				+        struct _starpu_mp_transfer_command_to_device cmd = {dst_node->peer_id, size, src, NULL};
			
 
				 
			
 
				-	/* Tell source to send data to dest. */
			
 
				-	_starpu_mp_common_send_command(src_node, STARPU_MP_COMMAND_SEND_TO_SINK, &cmd, sizeof(cmd));
			
 
				+        /* lock the node with the smaller peer_id first to prevent deadlock */
			
 
				+        if (src_node->peer_id > dst_node->peer_id)
			
 
				+        {
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&dst_node->connection_mutex);
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&src_node->connection_mutex);
			
 
				+        }
			
 
				+        else
			
 
				+        {
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&src_node->connection_mutex);
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&dst_node->connection_mutex);
			
 
				+        }
			
 
				 
			
 
				-	cmd.devid = src_node->peer_id;
			
 
				-	cmd.size = size;
			
 
				-	cmd.addr = dst;
			
 
				+        /* Tell source to send data to dest. */
			
 
				+        _starpu_mp_common_send_command(src_node, STARPU_MP_COMMAND_SEND_TO_SINK, &cmd, sizeof(cmd));
			
 
				 
			
 
				-	/* Tell dest to receive data from source. */
			
 
				-	_starpu_mp_common_send_command(dst_node, STARPU_MP_COMMAND_RECV_FROM_SINK, &cmd, sizeof(cmd));
			
 
				+        /* Release the source as fast as possible */
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&src_node->connection_mutex);
			
 
				 
			
 
				-	/* Wait for answer from dest to know wether transfer is finished. */
			
 
				-	answer = _starpu_mp_common_recv_command(dst_node, &arg, &arg_size);
			
 
				+        cmd.devid = src_node->peer_id;
			
 
				+        cmd.size = size;
			
 
				+        cmd.addr = dst;
			
 
				 
			
 
				-	STARPU_ASSERT(answer == STARPU_MP_COMMAND_TRANSFER_COMPLETE);
			
 
				+        /* Tell dest to receive data from source. */
			
 
				+        _starpu_mp_common_send_command(dst_node, STARPU_MP_COMMAND_RECV_FROM_SINK, &cmd, sizeof(cmd));
			
 
				 
			
 
				-	return 0;
			
 
				+        /* Wait for answer from dest to know whether transfer is finished. */
			
 
				+        answer = _starpu_src_common_wait_command_sync(dst_node, &arg, &arg_size);
			
 
				+
			
 
				+        STARPU_ASSERT(answer == STARPU_MP_COMMAND_TRANSFER_COMPLETE);
			
 
				+
			
 
				+        /* Release the receiver when we received the acknowledgment */
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&dst_node->connection_mutex);
			
 
				+
			
 
				+        return 0;
			
 
				+}
			
 
				+
			
 
				+/* Tell the sink linked to SRC_NODE to send SIZE bytes of data pointed by SRC
			
 
				+ * to the sink linked to DST_NODE. The latter store them in DST with an asynchronous
			
 
				+ * mode.
			
 
				+ */
			
 
				+int _starpu_src_common_copy_sink_to_sink_async(struct _starpu_mp_node *src_node,
			
 
				+		struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size, void * event)
			
 
				+{
			
 
				+        struct _starpu_mp_transfer_command_to_device cmd = {dst_node->peer_id, size, src, event};
			
 
				+
			
 
				+        /* lock the node with the smaller peer_id first to prevent deadlock */
			
 
				+        if (src_node->peer_id > dst_node->peer_id)
			
 
				+        {
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&dst_node->connection_mutex);
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&src_node->connection_mutex);
			
 
				+        }
			
 
				+        else
			
 
				+        {
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&src_node->connection_mutex);
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&dst_node->connection_mutex);
			
 
				+        }
			
 
				+
			
 
				+        /* For asynchronous transfers, we save information
			
 
				+         * to test if they are finished
			
 
				+         */
			
 
				+        struct _starpu_async_channel * async_channel = event;
			
 
				+        async_channel->polling_node_sender = src_node; 
			
 
				+        async_channel->polling_node_receiver = dst_node; 
			
 
				+        /* Increase number of ack waited */
			
 
				+        async_channel->starpu_mp_common_finished_receiver++;
			
 
				+        async_channel->starpu_mp_common_finished_sender++;
			
 
				+
			
 
				+        /* Tell source to send data to dest. */
			
 
				+        _starpu_mp_common_send_command(src_node, STARPU_MP_COMMAND_SEND_TO_SINK_ASYNC, &cmd, sizeof(cmd));
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&src_node->connection_mutex);
			
 
				+
			
 
				+        cmd.devid = src_node->peer_id;
			
 
				+        cmd.size = size;
			
 
				+        cmd.addr = dst;
			
 
				+
			
 
				+        /* Tell dest to receive data from source. */
			
 
				+        _starpu_mp_common_send_command(dst_node, STARPU_MP_COMMAND_RECV_FROM_SINK_ASYNC, &cmd, sizeof(cmd));
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&dst_node->connection_mutex);
			
 
				+
			
 
				+        return -EAGAIN;
			
 
				 }
			
 
				 
			
 
				 /* 5 functions to determine the executable to run on the device (MIC, SCC,
			
@@ -643,6 +869,44 @@ int _starpu_src_common_locate_file(char *located_file_name,
 
				 	return 1;
			
 
				 }
			
 
				 
			
 
				+
			
 
				+#if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
			
 
				+void _starpu_src_common_init_switch_env(unsigned this)
			
 
				+{
			
 
				+        save_thread_env[this].current_task = starpu_task_get_current();
			
 
				+        save_thread_env[this].current_worker = STARPU_PTHREAD_GETSPECIFIC(_starpu_worker_key);
			
 
				+        save_thread_env[this].current_worker_set = STARPU_PTHREAD_GETSPECIFIC(_starpu_worker_set_key);
			
 
				+        save_thread_env[this].current_mem_node = STARPU_PTHREAD_GETSPECIFIC(_starpu_memory_node_key);
			
 
				+#ifdef STARPU_OPENMP
			
 
				+        save_thread_env[this].current_omp_thread = STARPU_PTHREAD_GETSPECIFIC(omp_thread_key);
			
 
				+        save_thread_env[this].current_omp_task = STARPU_PTHREAD_GETSPECIFIC(omp_task_key);
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+static void _starpu_src_common_switch_env(unsigned old, unsigned new)
			
 
				+{
			
 
				+        save_thread_env[old].current_task = starpu_task_get_current();
			
 
				+        save_thread_env[old].current_worker = STARPU_PTHREAD_GETSPECIFIC(_starpu_worker_key);
			
 
				+        save_thread_env[old].current_worker_set = STARPU_PTHREAD_GETSPECIFIC(_starpu_worker_set_key);
			
 
				+        save_thread_env[old].current_mem_node = STARPU_PTHREAD_GETSPECIFIC(_starpu_memory_node_key);
			
 
				+#ifdef STARPU_OPENMP
			
 
				+        save_thread_env[old].current_omp_thread = STARPU_PTHREAD_GETSPECIFIC(omp_thread_key);
			
 
				+        save_thread_env[old].current_omp_task = STARPU_PTHREAD_GETSPECIFIC(omp_task_key);
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+        _starpu_set_current_task(save_thread_env[new].current_task);
			
 
				+        STARPU_PTHREAD_SETSPECIFIC(_starpu_worker_key, save_thread_env[new].current_worker);
			
 
				+        STARPU_PTHREAD_SETSPECIFIC(_starpu_worker_set_key, save_thread_env[new].current_worker_set);
			
 
				+        STARPU_PTHREAD_SETSPECIFIC(_starpu_memory_node_key, save_thread_env[new].current_mem_node);
			
 
				+#ifdef STARPU_OPENMP
			
 
				+        STARPU_PTHREAD_SETSPECIFIC(omp_thread_key, save_thread_env[new].current_omp_thread);
			
 
				+        STARPU_PTHREAD_SETSPECIFIC(omp_task_key, save_thread_env[new].current_omp_task); 
			
 
				+#endif
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				 /* Send workers to the sink node
			
 
				  */
			
 
				 static void _starpu_src_common_send_workers(struct _starpu_mp_node * node, int baseworkerid, int nworkers)
			
@@ -657,96 +921,234 @@ static void _starpu_src_common_send_workers(struct _starpu_mp_node * node, int b
 
				 	msg[3] = baseworkerid;
			
 
				 	msg[4] = starpu_worker_get_count();
			
 
				 
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&node->connection_mutex);
			
 
				+
			
 
				 	/* tell the sink node that we will send him all workers */
			
 
				 	_starpu_mp_common_send_command(node, STARPU_MP_COMMAND_SYNC_WORKERS,
			
 
				 			&msg, sizeof(msg));
			
 
				 
			
 
				 	/* Send all worker to the sink node */
			
 
				-	node->dt_send(node,&config->workers[baseworkerid],worker_size);
			
 
				+	node->dt_send(node,&config->workers[baseworkerid],worker_size, NULL);
			
 
				 
			
 
				 	/* Send all combined workers to the sink node */
			
 
				-	node->dt_send(node, &config->combined_workers,combined_worker_size);
			
 
				+	node->dt_send(node, &config->combined_workers,combined_worker_size, NULL);
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
			
 
				 }
			
 
				 
			
 
				-/* Function looping on the source node */
			
 
				-void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
			
 
				-		unsigned baseworkerid,
			
 
				-		struct _starpu_mp_node * mp_node)
			
 
				+/* Callback used when a buffer is sent asynchronously to the sink */
			
 
				+static void _starpu_src_common_send_data_callback(void *arg)
			
 
				 {
			
 
				-	unsigned memnode = worker_set->workers[0].memory_node;
			
 
				-	struct starpu_task **tasks;
			
 
				+        struct _starpu_worker * worker = (struct _starpu_worker *) arg;
			
 
				 
			
 
				-	_STARPU_MALLOC(tasks, sizeof(struct starpu_task *)*worker_set->nworkers);
			
 
				+        /* Count one more buffer sent for this worker */
			
 
				+        STARPU_WMB();
			
 
				+        (void)STARPU_ATOMIC_ADD(&worker->nb_buffers_sent, 1);
			
 
				+}
			
 
				 
			
 
				-	_starpu_src_common_send_workers(mp_node, baseworkerid, worker_set->nworkers);
			
 
				 
			
 
				-	/*main loop*/
			
 
				-	while (_starpu_machine_is_running())
			
 
				-	{
			
 
				-		int res = 0;
			
 
				+static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set * worker_set, struct _starpu_mp_node * mp_node, struct starpu_task **tasks, unsigned memnode)
			
 
				+{
			
 
				+        int res = 0;
			
 
				 
			
 
				-		_starpu_may_pause();
			
 
				+        _starpu_may_pause();
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID
			
 
				-		starpu_pthread_wait_reset(&worker_set->workers[0].wait);
			
 
				+        starpu_pthread_wait_reset(&worker_set->workers[0].wait);
			
 
				 #endif
			
 
				 
			
 
				-		_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				-		res |= __starpu_datawizard_progress(memnode, 1, 1);
			
 
				-		res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
			
 
				-		_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				 
			
 
				-		/* Handle message which have been store */
			
 
				-		_starpu_src_common_handle_stored_async(mp_node);
			
 
				+        /* Test if async transfers are completed */
			
 
				+        for (unsigned i = 0; i < worker_set->nworkers; i++)
			
 
				+        {
			
 
				+                /* Once all buffers of the pending task have been sent, execute the task */
			
 
				+                if (worker_set->workers[i].task_sending != NULL && worker_set->workers[i].nb_buffers_sent == STARPU_TASK_GET_NBUFFERS(worker_set->workers[i].task_sending))
			
 
				+                {
			
 
				+                        int workerid = worker_set->workers[i].workerid;
			
 
				+
			
 
				+                        STARPU_RMB();
			
 
				+                        _STARPU_TRACE_WORKER_END_FETCH_INPUT(NULL, workerid);
			
 
				+
			
 
				+                        unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(worker_set->workers[i].task_sending);
			
 
				+                        unsigned buf;
			
 
				+                        for (buf = 0; buf < nbuffers; buf++)
			
 
				+                        {
			
 
				+                                starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(worker_set->workers[i].task_sending, buf);
			
 
				+                                struct _starpu_data_replicate *replicate = &handle->per_node[memnode];
			
 
				+                                /* Release our refcnt */
			
 
				+                                _starpu_spin_lock(&handle->header_lock);
			
 
				+                                replicate->refcnt--;
			
 
				+                                STARPU_ASSERT(replicate->refcnt >= 0);
			
 
				+                                STARPU_ASSERT(handle->busy_count > 0);
			
 
				+                                handle->busy_count--;
			
 
				+                                if (!_starpu_data_check_not_busy(handle))
			
 
				+                                        _starpu_spin_unlock(&handle->header_lock);
			
 
				+                        }
			
 
				+
			
 
				+                        /* Execute the task */
			
 
				+                        struct _starpu_job * j = _starpu_get_job_associated_to_task(worker_set->workers[i].task_sending);
			
 
				+                        _starpu_set_local_worker_key(&worker_set->workers[i]);
			
 
				+                        res =  _starpu_src_common_execute(j, &worker_set->workers[i], mp_node);
			
 
				+                        switch (res)
			
 
				+                        {
			
 
				+                                case 0:
			
 
				+                                        /* The task has been launched with no error */
			
 
				+                                        break;
			
 
				+                                case -EAGAIN:
			
 
				+                                        _STARPU_DISP("ouch, this MP worker could not actually run task %p, putting it back...\n", tasks[i]);
			
 
				+                                        _starpu_push_task_to_workers(worker_set->workers[i].task_sending);
			
 
				+                                        STARPU_ABORT();
			
 
				+                                        continue;
			
 
				+                                        break;
			
 
				+                                default:
			
 
				+                                        STARPU_ASSERT(0);
			
 
				+                        }
			
 
				+
			
 
				+                        /* Reset it */
			
 
				+                        worker_set->workers[i].task_sending = NULL;
			
 
				+                        worker_set->workers[i].nb_buffers_sent = 0;
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+        _STARPU_TRACE_START_PROGRESS(memnode);
			
 
				+        res |= __starpu_datawizard_progress(1, 1);
			
 
				+        _STARPU_TRACE_END_PROGRESS(memnode);
			
 
				+
			
 
				+        /* Handle messages which have been stored */
			
 
				+        _starpu_src_common_handle_stored_async(mp_node);
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
			
 
				+
			
 
				+        /* poll the device for completed jobs.*/
			
 
				+        while(mp_node->mp_recv_is_ready(mp_node))
			
 
				+        {
			
 
				+                _starpu_src_common_recv_async(mp_node);
			
 
				+                /* The mutex is unlocked in _starpu_src_common_recv_async */
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&mp_node->connection_mutex);
			
 
				+        }
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&mp_node->connection_mutex);
			
 
				+
			
 
				+        /* get task for each worker*/
			
 
				+        res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, memnode);
			
 
				 
			
 
				-		/* poll the device for completed jobs.*/
			
 
				-		while(mp_node->mp_recv_is_ready(mp_node))
			
 
				-			_starpu_src_common_recv_async(mp_node);
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+        if (!res)
			
 
				+                starpu_pthread_wait_wait(&worker_set->workers[0].wait);
			
 
				+#endif
			
 
				 
			
 
				-		/* get task for each worker*/
			
 
				-		res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, memnode);
			
 
				+        /* If at least one worker has popped a task */
			
 
				+        if(res != 0)
			
 
				+        {
			
 
				+                unsigned i, buf;
			
 
				+                for(i=0; i<worker_set->nworkers; i++)
			
 
				+                {
			
 
				+                        if(tasks[i] != NULL)
			
 
				+                        {
			
 
				+                                int workerid = worker_set->workers[i].workerid;
			
 
				+                                _STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
			
 
				+                                unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(tasks[i]);
			
 
				+
			
 
				+                                for (buf = 0; buf < nbuffers; buf++)
			
 
				+                                {
			
 
				+                                        starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(tasks[i], buf);
			
 
				+                                        enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(tasks[i], buf);
			
 
				+                                        struct _starpu_data_replicate *local_replicate = get_replicate(handle, mode, workerid, memnode);
			
 
				+
			
 
				+                                        int ret = _starpu_fetch_data_on_node(handle, memnode, local_replicate, mode, 0, 0, 1,
			
 
				+                                                        _starpu_src_common_send_data_callback, &worker_set->workers[i], 0, "_starpu_src_common_worker_internal_work");
			
 
				+                                        STARPU_ASSERT(!ret);
			
 
				+                                }
			
 
				+                                worker_set->workers[i].task_sending = tasks[i];
			
 
				+                        }
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+        /* Handle messages which have been stored */
			
 
				+        _starpu_src_common_handle_stored_async(mp_node);
			
 
				 
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-		if (!res)
			
 
				-			starpu_pthread_wait_wait(&worker_set->workers[0].wait);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+#if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
			
 
				+/* Function looping on the source node */
			
 
				+void _starpu_src_common_workers_set(struct _starpu_worker_set * worker_set,
			
 
				+        int ndevices, struct _starpu_mp_node ** mp_node)
			
 
				+{
			
 
				+        unsigned memnode[ndevices];
			
 
				+        unsigned offsetmemnode[ndevices];
			
 
				+        memset(offsetmemnode, 0, ndevices*sizeof(unsigned));
			
 
				+
			
 
				+        int device;
			
 
				+        int nbworkers = 0;
			
 
				+        for (device = 0; device < ndevices; device++)
			
 
				+        {
			
 
				+                memnode[device] = worker_set[device].workers[0].memory_node;
			
 
				+                nbworkers += worker_set[device].nworkers;
			
 
				+                if (device != 0)
			
 
				+                        offsetmemnode[device] += offsetmemnode[device-1];
			
 
				+                if (device != ndevices -1)
			
 
				+                        offsetmemnode[device+1] += worker_set[device].nworkers;
			
 
				+        }
			
 
				+
			
 
				+        struct starpu_task **tasks;
			
 
				+        _STARPU_MALLOC(tasks, sizeof(struct starpu_task *)*nbworkers);
			
 
				+
			
 
				+        for (device = 0; device < ndevices; device++)
			
 
				+        {
			
 
				+                struct _starpu_worker *baseworker = &worker_set[device].workers[0];
			
 
				+                struct _starpu_machine_config *config = baseworker->config;
			
 
				+                unsigned baseworkerid = baseworker - config->workers;
			
 
				+                _starpu_src_common_send_workers(mp_node[device], baseworkerid, worker_set[device].nworkers);
			
 
				+        }
			
 
				+
			
 
				+        /*main loop*/
			
 
				+        while (_starpu_machine_is_running())
			
 
				+        {
			
 
				+                for (device = 0; device < ndevices ; device++)
			
 
				+                {
			
 
				+                        _starpu_src_common_switch_env(((device-1)+ndevices)%ndevices, device);
			
 
				+                        _starpu_src_common_worker_internal_work(&worker_set[device], mp_node[device], tasks+offsetmemnode[device], memnode[device]);
			
 
				+                }
			
 
				+        }
			
 
				+        free(tasks);
			
 
				+
			
 
				+        for (device = 0; device < ndevices; device++)
			
 
				+                _starpu_handle_all_pending_node_data_requests(memnode[device]);
			
 
				+
			
 
				+        /* In case there remains some memory that was automatically
			
 
				+         * allocated by StarPU, we release it now. Note that data
			
 
				+         * coherency is not maintained anymore at that point ! */
			
 
				+        for (device = 0; device < ndevices; device++)
			
 
				+                _starpu_free_all_automatically_allocated_buffers(memnode[device]);
			
 
				+
			
 
				+}
			
 
				 #endif
			
 
				 
			
 
				-		/*if at least one worker have pop a task*/
			
 
				-		if(res != 0)
			
 
				-		{
			
 
				-			unsigned i;
			
 
				-			for(i=0; i<worker_set->nworkers; i++)
			
 
				-			{
			
 
				-				if(tasks[i] != NULL)
			
 
				-				{
			
 
				-					struct _starpu_job * j = _starpu_get_job_associated_to_task(tasks[i]);
			
 
				-					_starpu_set_local_worker_key(&worker_set->workers[i]);
			
 
				-					res =  _starpu_src_common_execute(j, &worker_set->workers[i], mp_node);
			
 
				-					switch (res)
			
 
				-					{
			
 
				-						case 0:
			
 
				-							/* The task task has been launched with no error */
			
 
				-							break;
			
 
				-						case -EAGAIN:
			
 
				-							_STARPU_DISP("ouch, this MP worker could not actually run task %p, putting it back...\n", tasks[i]);
			
 
				-							_starpu_push_task_to_workers(tasks[i]);
			
 
				-							STARPU_ABORT();
			
 
				-							continue;
			
 
				-							break;
			
 
				-						default:
			
 
				-							STARPU_ASSERT(0);
			
 
				-					}
			
 
				-				}
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-	free(tasks);
			
 
				+/* Function looping on the source node */
			
 
				+void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
			
 
				+		unsigned baseworkerid,
			
 
				+		struct _starpu_mp_node * mp_node)
			
 
				+{
			
 
				+        unsigned memnode = worker_set->workers[0].memory_node;
			
 
				+        struct starpu_task **tasks;
			
 
				+
			
 
				+        _STARPU_MALLOC(tasks, sizeof(struct starpu_task *)*worker_set->nworkers);
			
 
				+
			
 
				+        _starpu_src_common_send_workers(mp_node, baseworkerid, worker_set->nworkers);
			
 
				+
			
 
				+        /*main loop*/
			
 
				+        while (_starpu_machine_is_running())
			
 
				+        {
			
 
				+                _starpu_src_common_worker_internal_work(worker_set, mp_node, tasks, memnode);
			
 
				+        }
			
 
				+        free(tasks);
			
 
				 
			
 
				-	_starpu_handle_all_pending_node_data_requests(memnode);
			
 
				+        _starpu_handle_all_pending_node_data_requests(memnode);
			
 
				 
			
 
				-	/* In case there remains some memory that was automatically
			
 
				-	 * allocated by StarPU, we release it now. Note that data
			
 
				-	 * coherency is not maintained anymore at that point ! */
			
 
				-	_starpu_free_all_automatically_allocated_buffers(memnode);
			
 
				+        /* In case there remains some memory that was automatically
			
 
				+         * allocated by StarPU, we release it now. Note that data
			
 
				+         * coherency is not maintained anymore at that point ! */
			
 
				+        _starpu_free_all_automatically_allocated_buffers(memnode);
			
 
				 
			
 
				 }
			
--- a/src/drivers/mp_common/source_common.h
+++ b/src/drivers/mp_common/source_common.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -28,15 +28,12 @@
 
				 
			
 
				 enum _starpu_mp_command _starpu_src_common_wait_command_sync(struct _starpu_mp_node *node, 
			
 
				 							     void ** arg, int* arg_size);
			
 
				-void _starpu_src_common_recv_async(struct _starpu_worker_set *worker_set, 
			
 
				-				   struct _starpu_mp_node * baseworker_node);
			
 
				-
			
 
				 int _starpu_src_common_store_message(struct _starpu_mp_node *node, 
			
 
				 		void * arg, int arg_size, enum _starpu_mp_command answer);
			
 
				 
			
 
				 enum _starpu_mp_command _starpu_src_common_wait_completed_execution(struct _starpu_mp_node *node, int devid, void **arg, int * arg_size);
			
 
				 
			
 
				-int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf);
			
 
				+int _starpu_src_common_sink_nbcores (struct _starpu_mp_node *node, int *buf);
			
 
				 
			
 
				 int _starpu_src_common_lookup(const struct _starpu_mp_node *node,
			
 
				 			      void (**func_ptr)(void), const char *func_name);
			
@@ -44,7 +41,7 @@ int _starpu_src_common_lookup(const struct _starpu_mp_node *node,
 
				 int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
			
 
				 				void **addr, size_t size);
			
 
				 
			
 
				-void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
			
 
				+void _starpu_src_common_free(struct _starpu_mp_node *mp_node,
			
 
				 			     void *addr);
			
 
				 
			
 
				 int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
			
@@ -57,14 +54,23 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 
				 				      void *cl_arg, size_t cl_arg_size);
			
 
				 
			
 
				 
			
 
				-int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
			
 
				+int _starpu_src_common_copy_host_to_sink_sync(struct _starpu_mp_node *mp_node,
			
 
				 					 void *src, void *dst, size_t size);
			
 
				 
			
 
				-int _starpu_src_common_copy_sink_to_host(const struct _starpu_mp_node *mp_node,
			
 
				+int _starpu_src_common_copy_sink_to_host_sync(struct _starpu_mp_node *mp_node,
			
 
				 					 void *src, void *dst, size_t size);
			
 
				 
			
 
				-int _starpu_src_common_copy_sink_to_sink(const struct _starpu_mp_node *src_node,
			
 
				-					 const struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size);
			
 
				+int _starpu_src_common_copy_sink_to_sink_sync(struct _starpu_mp_node *src_node,
			
 
				+					 struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size);
			
 
				+
			
 
				+int _starpu_src_common_copy_host_to_sink_async(struct _starpu_mp_node *mp_node,
			
 
				+					 void *src, void *dst, size_t size, void *event);
			
 
				+
			
 
				+int _starpu_src_common_copy_sink_to_host_async(struct _starpu_mp_node *mp_node,
			
 
				+					 void *src, void *dst, size_t size, void *event);
			
 
				+
			
 
				+int _starpu_src_common_copy_sink_to_sink_async(struct _starpu_mp_node *src_node,
			
 
				+					 struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size, void *event);
			
 
				 
			
 
				 int _starpu_src_common_locate_file(char *located_file_name,
			
 
				 				   const char *env_file_name, const char *env_mic_path,
			
@@ -75,6 +81,12 @@ void _starpu_src_common_worker(struct _starpu_worker_set * worker_set,
 
				 			       unsigned baseworkerid, 
			
 
				 			       struct _starpu_mp_node * node_set);
			
 
				 
			
 
				+#if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
			
 
				+void _starpu_src_common_init_switch_env(unsigned this);
			
 
				+void _starpu_src_common_workers_set(struct _starpu_worker_set * worker_set,
			
 
				+                 int ndevices,
			
 
				+                 struct _starpu_mp_node ** mp_node);
			
 
				+#endif
			
 
				 
			
 
				 #endif /* STARPU_USE_MP */
			
 
				 
			
--- a/src/drivers/mpi/driver_mpi_common.c
+++ b/src/drivers/mpi/driver_mpi_common.c
@@ -0,0 +1,558 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
			
 
				+ * Copyright (C) 2016  Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <mpi.h>
			
 
				+#include <core/workers.h>
			
 
				+#include <core/perfmodel/perfmodel.h>
			
 
				+#include <drivers/mp_common/source_common.h>
			
 
				+#include "driver_mpi_common.h"
			
 
				+
			
 
				+#define NITER 32
			
 
				+#define SIZE_BANDWIDTH (1024*1024)
			
 
				+
			
 
				+#define DRIVER_MPI_MASTER_NODE_DEFAULT 0
			
 
				+
			
 
				+static int mpi_initialized = 0;
			
 
				+static int extern_initialized = 0;
			
 
				+static int src_node_id;
			
 
				+
			
 
				+static void _starpu_mpi_set_src_node_id()
			
 
				+{
			
 
				+        int node_id = starpu_get_env_number("STARPU_MPI_MASTER_NODE");
			
 
				+
			
 
				+        if (node_id != -1)
			
 
				+        {
			
 
				+                int nb_proc, id_proc;
			
 
				+                MPI_Comm_size(MPI_COMM_WORLD, &nb_proc);
			
 
				+                MPI_Comm_rank(MPI_COMM_WORLD, &id_proc);
			
 
				+
			
 
				+                if (node_id < nb_proc)
			
 
				+                {
			
 
				+                        src_node_id = node_id;
			
 
				+                        return;
			
 
				+                }
			
 
				+                else if (id_proc == DRIVER_MPI_MASTER_NODE_DEFAULT)
			
 
				+                {
			
 
				+                        /* Only one node prints the error message. */
			
 
				+                        _STARPU_DISP("The node you specify to be the master is "
			
 
				+                                        "greater than the total number of nodes.\n"
			
 
				+                                        "Taking node %d by default...\n", DRIVER_MPI_MASTER_NODE_DEFAULT);
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+        /* Node by default. */
			
 
				+        src_node_id = DRIVER_MPI_MASTER_NODE_DEFAULT;
			
 
				+}
			
 
				+
			
 
				+int _starpu_mpi_common_mp_init()
			
 
				+{
			
 
				+        /* Initialize MPI for the master-slave driver. Returns 1 on success, or -ENODEV when this was already done (e.g. starpu_init called twice). */
			
 
				+        if (mpi_initialized)
			
 
				+                return -ENODEV;
			
 
				+
			
 
				+        mpi_initialized = 1;
			
 
				+
			
 
				+        if (MPI_Initialized(&extern_initialized) != MPI_SUCCESS)
			
 
				+                STARPU_ABORT_MSG("Cannot check if MPI is initialized or not !");
			
 
				+
			
 
				+        /* MPI_Init or MPI_Init_thread was not called by the application yet: do it ourselves */
			
 
				+        if (!extern_initialized)
			
 
				+        {
			
 
				+
			
 
				+#if defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
			
 
				+                int required = MPI_THREAD_MULTIPLE;
			
 
				+#else
			
 
				+                int required = MPI_THREAD_FUNNELED;
			
 
				+#endif
			
 
				+                /* Call MPI outside of the assertion: an assertion may be compiled out together with its argument */
			
 
				+                int thread_support;
			
 
				+                int ret = MPI_Init_thread(_starpu_get_argc(), _starpu_get_argv(), required, &thread_support);
			
 
				+                STARPU_ASSERT(ret == MPI_SUCCESS);
			
 
				+                if (thread_support < required) /* thread levels are ordered: getting a higher level than required is fine */
			
 
				+                {
			
 
				+                        if (required == MPI_THREAD_MULTIPLE)
			
 
				+                                _STARPU_DISP("MPI doesn't support MPI_THREAD_MULTIPLE option. MPI Master-Slave can have problems if multiple slaves are launched. \n");
			
 
				+                        if (required == MPI_THREAD_FUNNELED)
			
 
				+                                _STARPU_DISP("MPI doesn't support MPI_THREAD_FUNNELED option. Many errors can occur. \n");
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+        /* Find which node is the master */
			
 
				+        _starpu_mpi_set_src_node_id();
			
 
				+
			
 
				+        return 1;
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_common_mp_deinit()
			
 
				+{
			
 
				+        if (!extern_initialized)
			
 
				+                MPI_Finalize();    
			
 
				+}
			
 
				+
			
 
				+int _starpu_mpi_common_is_src_node()
			
 
				+{   
			
 
				+        int id_proc;
			
 
				+        MPI_Comm_rank(MPI_COMM_WORLD, &id_proc);
			
 
				+        return id_proc == src_node_id;
			
 
				+} 
			
 
				+
			
 
				+int _starpu_mpi_common_get_src_node()
			
 
				+{
			
 
				+        return src_node_id;
			
 
				+}
			
 
				+
			
 
				+int _starpu_mpi_common_is_mp_initialized()
			
 
				+{
			
 
				+        return mpi_initialized;
			
 
				+}
			
 
				+
			
 
				+/* common parts to initialize a source or a sink node */
			
 
				+void _starpu_mpi_common_mp_initialize_src_sink(struct _starpu_mp_node *node)
			
 
				+{
			
 
				+        struct _starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
			
 
				+
			
 
				+        node->nb_cores = topology->nhwcpus;
			
 
				+}
			
 
				+
			
 
				+int _starpu_mpi_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
			
 
				+{
			
 
				+        int res, source;
			
 
				+        int flag = 0;
			
 
				+        int id_proc;
			
 
				+        MPI_Comm_rank(MPI_COMM_WORLD, &id_proc);
			
 
				+
			
 
				+        if (id_proc == src_node_id)
			
 
				+        {
			
 
				+                /* Source has mp_node defined */
			
 
				+                source = mp_node->mp_connection.mpi_remote_nodeid;
			
 
				+        }
			
 
				+        else
			
 
				+        {
			
 
				+                /* Sink can have sink to sink message */
			
 
				+                source = MPI_ANY_SOURCE;
			
 
				+        }
			
 
				+
			
 
				+        res = MPI_Iprobe(source, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
			
 
				+        STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot test if we received a message !");
			
 
				+
			
 
				+        return flag;
			
 
				+}
			
 
				+
			
 
				+/* Send LEN bytes from MSG to the MPI rank stored in NODE's connection.
			
 
				+ * With a NULL EVENT the send is blocking (MPI_Send with SYNC_TAG). Otherwise
			
 
				+ * EVENT is a struct _starpu_async_channel: the send is posted with MPI_Isend
			
 
				+ * (ASYNC_TAG) and its request is appended to the channel's request list. */
			
 
				+void _starpu_mpi_common_send(const struct _starpu_mp_node *node, void *msg, int len, void * event)
			
 
				+{
			
 
				+        int res;
			
 
				+
			
 
				+        if (event)
			
 
				+        {
			
 
				+                /* Asynchronous send */
			
 
				+                struct _starpu_async_channel * channel = event;
			
 
				+                channel->event.mpi_ms_event.is_sender = 1;
			
 
				+
			
 
				+                /* Called by a sink: we need to initialize some parts ourselves; for the host this is done in data_request.c */
			
 
				+                if (channel->type == STARPU_UNUSED)
			
 
				+                        channel->event.mpi_ms_event.requests = NULL;
			
 
				+
			
 
				+                /* Create the request list on first use */
			
 
				+                if (channel->event.mpi_ms_event.requests == NULL)
			
 
				+                {
			
 
				+                        channel->event.mpi_ms_event.requests = _starpu_mpi_ms_event_request_list_new();
			
 
				+                        _starpu_mpi_ms_event_request_list_init(channel->event.mpi_ms_event.requests);
			
 
				+                }
			
 
				+
			
 
				+                struct _starpu_mpi_ms_event_request * req = _starpu_mpi_ms_event_request_new();
			
 
				+
			
 
				+                res = MPI_Isend(msg, len, MPI_BYTE, node->mp_connection.mpi_remote_nodeid, ASYNC_TAG, MPI_COMM_WORLD, &req->request);
			
 
				+
			
 
				+                channel->starpu_mp_common_finished_receiver++;
			
 
				+                channel->starpu_mp_common_finished_sender++;
			
 
				+
			
 
				+                _starpu_mpi_ms_event_request_list_push_back(channel->event.mpi_ms_event.requests, req);
			
 
				+        }
			
 
				+        else
			
 
				+        {
			
 
				+                /* Synchronous send */
			
 
				+                res = MPI_Send(msg, len, MPI_BYTE, node->mp_connection.mpi_remote_nodeid, SYNC_TAG, MPI_COMM_WORLD);
			
 
				+        }
			
 
				+
			
 
				+        STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot send a msg with a size of %d Bytes !", len);
			
 
				+}
			
 
				+
			
 
				+/* Blocking send of len bytes from msg to the remote peer of node.
				+ * Passing a NULL event to _starpu_mpi_common_send() selects its
				+ * synchronous (MPI_Send) branch.
				+ */
				+void _starpu_mpi_common_mp_send(const struct _starpu_mp_node *node, void *msg, int len)
				+{
				+        void *no_event = NULL;
				+
				+        _starpu_mpi_common_send(node, msg, len, no_event);
				+}
			
 
				+
			
 
				+
			
 
				+/* Receive len bytes into msg from the remote peer of node
				+ * (node->mp_connection.mpi_remote_nodeid).
				+ *
				+ * event == NULL: blocking MPI_Recv tagged SYNC_TAG; asserts that exactly
				+ *   len bytes were received.
				+ * event != NULL: event is a struct _starpu_async_channel *. A non-blocking
				+ *   MPI_Irecv tagged ASYNC_TAG is posted and its request is appended to the
				+ *   channel's request list, to be completed by
				+ *   _starpu_mpi_common_test_event() or _starpu_mpi_common_wait_event().
				+ *
				+ * Any MPI failure triggers an assertion.
				+ */
				+void _starpu_mpi_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event)
				+{
				+        int res;
				+
				+        if (event)
				+        {
				+                /* Asynchronous recv */
				+                struct _starpu_async_channel * channel = event;
				+                channel->event.mpi_ms_event.is_sender = 0;
				+
				+                /* When called by a sink the channel has not been set up yet
				+                 * (for the host it is done in data_request.c), so start from
				+                 * an empty request list. */
				+                if (channel->type == STARPU_UNUSED)
				+                        channel->event.mpi_ms_event.requests = NULL;
				+
				+                /* Lazily create the request list */
				+                if (channel->event.mpi_ms_event.requests == NULL)
				+                {
				+                        channel->event.mpi_ms_event.requests = _starpu_mpi_ms_event_request_list_new();
				+                        _starpu_mpi_ms_event_request_list_init(channel->event.mpi_ms_event.requests);
				+                }
				+
				+                struct _starpu_mpi_ms_event_request * req = _starpu_mpi_ms_event_request_new();
				+
				+                res = MPI_Irecv(msg, len, MPI_BYTE, node->mp_connection.mpi_remote_nodeid, ASYNC_TAG, MPI_COMM_WORLD, &req->request);
				+                STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot receive a msg with a size of %d Bytes !", len);
				+
				+                /* Both pending counters are raised for each posted request */
				+                channel->starpu_mp_common_finished_receiver++;
				+                channel->starpu_mp_common_finished_sender++;
				+
				+                _starpu_mpi_ms_event_request_list_push_back(channel->event.mpi_ms_event.requests, req);
				+        }
				+        else
				+        {
				+                /* Synchronous recv */
				+                MPI_Status s;
				+                int num_expected;
				+
				+                res = MPI_Recv(msg, len, MPI_BYTE, node->mp_connection.mpi_remote_nodeid, SYNC_TAG, MPI_COMM_WORLD, &s);
				+                /* Check the return code before trusting the status */
				+                STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot receive a msg with a size of %d Bytes !", len);
				+
				+                MPI_Get_count(&s, MPI_BYTE, &num_expected);
				+                STARPU_ASSERT_MSG(num_expected == len, "MPI Master/Slave received a msg with a size of %d Bytes (expected %d Bytes) !", num_expected, len);
				+        }
				+}
			
 
				+
			
 
				+/* Blocking receive of len bytes into msg from the remote peer of node.
				+ * Passing a NULL event to _starpu_mpi_common_recv() selects its
				+ * synchronous (MPI_Recv) branch.
				+ */
				+void _starpu_mpi_common_mp_recv(const struct _starpu_mp_node *node, void *msg, int len)
				+{
				+        void *no_event = NULL;
				+
				+        _starpu_mpi_common_recv(node, msg, len, no_event);
				+}
			
 
				+
			
 
				+/* Send len bytes from msg to the MPI rank dst_devid (any node, not only
				+ * the peer recorded in node; node itself is not used here).
				+ *
				+ * event == NULL: blocking MPI_Send tagged SYNC_TAG.
				+ * event != NULL: event is a struct _starpu_async_channel *. A non-blocking
				+ *   MPI_Isend tagged ASYNC_TAG is posted and its request is appended to the
				+ *   channel's request list, to be completed by
				+ *   _starpu_mpi_common_test_event() or _starpu_mpi_common_wait_event().
				+ *
				+ * Any MPI failure triggers an assertion.
				+ */
				+void _starpu_mpi_common_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len, void * event)
				+{
				+        int res;
				+
				+        if (event)
				+        {
				+                /* Asynchronous send */
				+                struct _starpu_async_channel * channel = event;
				+                channel->event.mpi_ms_event.is_sender = 1;
				+
				+                /* When called by a sink the channel has not been set up yet
				+                 * (for the host it is done in data_request.c), so start from
				+                 * an empty request list. */
				+                if (channel->type == STARPU_UNUSED)
				+                        channel->event.mpi_ms_event.requests = NULL;
				+
				+                /* Lazily create the request list */
				+                if (channel->event.mpi_ms_event.requests == NULL)
				+                {
				+                        channel->event.mpi_ms_event.requests = _starpu_mpi_ms_event_request_list_new();
				+                        _starpu_mpi_ms_event_request_list_init(channel->event.mpi_ms_event.requests);
				+                }
				+
				+                struct _starpu_mpi_ms_event_request * req = _starpu_mpi_ms_event_request_new();
				+
				+                res = MPI_Isend(msg, len, MPI_BYTE, dst_devid, ASYNC_TAG, MPI_COMM_WORLD, &req->request);
				+
				+                /* Both pending counters are raised for each posted request */
				+                channel->starpu_mp_common_finished_receiver++;
				+                channel->starpu_mp_common_finished_sender++;
				+
				+                _starpu_mpi_ms_event_request_list_push_back(channel->event.mpi_ms_event.requests, req);
				+        }
				+        else
				+        {
				+                /* Synchronous send */
				+                res = MPI_Send(msg, len, MPI_BYTE, dst_devid, SYNC_TAG, MPI_COMM_WORLD);
				+        }
				+
				+        /* This is the send path: report a send failure, not a receive one */
				+        STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot send a msg with a size of %d Bytes !", len);
				+}
			
 
				+
			
 
				+/* Receive len bytes into msg from the MPI rank src_devid (any node, not
				+ * only the peer recorded in node; node itself is not used here).
				+ *
				+ * event == NULL: blocking MPI_Recv tagged SYNC_TAG; asserts that exactly
				+ *   len bytes were received.
				+ * event != NULL: event is a struct _starpu_async_channel *. A non-blocking
				+ *   MPI_Irecv tagged ASYNC_TAG is posted and its request is appended to the
				+ *   channel's request list, to be completed by
				+ *   _starpu_mpi_common_test_event() or _starpu_mpi_common_wait_event().
				+ *
				+ * Any MPI failure triggers an assertion, on both paths.
				+ */
				+void _starpu_mpi_common_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len, void * event)
				+{
				+        int res;
				+
				+        if (event)
				+        {
				+                /* Asynchronous recv */
				+                struct _starpu_async_channel * channel = event;
				+                channel->event.mpi_ms_event.is_sender = 0;
				+
				+                /* When called by a sink the channel has not been set up yet
				+                 * (for the host it is done in data_request.c), so start from
				+                 * an empty request list. */
				+                if (channel->type == STARPU_UNUSED)
				+                        channel->event.mpi_ms_event.requests = NULL;
				+
				+                /* Lazily create the request list */
				+                if (channel->event.mpi_ms_event.requests == NULL)
				+                {
				+                        channel->event.mpi_ms_event.requests = _starpu_mpi_ms_event_request_list_new();
				+                        _starpu_mpi_ms_event_request_list_init(channel->event.mpi_ms_event.requests);
				+                }
				+
				+                struct _starpu_mpi_ms_event_request * req = _starpu_mpi_ms_event_request_new();
				+
				+                res = MPI_Irecv(msg, len, MPI_BYTE, src_devid, ASYNC_TAG, MPI_COMM_WORLD, &req->request);
				+                /* The result of the non-blocking post must be checked too */
				+                STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot receive a msg with a size of %d Bytes !", len);
				+
				+                /* Both pending counters are raised for each posted request */
				+                channel->starpu_mp_common_finished_receiver++;
				+                channel->starpu_mp_common_finished_sender++;
				+
				+                _starpu_mpi_ms_event_request_list_push_back(channel->event.mpi_ms_event.requests, req);
				+        }
				+        else
				+        {
				+                /* Synchronous recv */
				+                MPI_Status s;
				+                int num_expected;
				+
				+                res = MPI_Recv(msg, len, MPI_BYTE, src_devid, SYNC_TAG, MPI_COMM_WORLD, &s);
				+                /* Check the return code before trusting the status */
				+                STARPU_ASSERT_MSG(res == MPI_SUCCESS, "MPI Master/Slave cannot receive a msg with a size of %d Bytes !", len);
				+
				+                MPI_Get_count(&s, MPI_BYTE, &num_expected);
				+                STARPU_ASSERT_MSG(num_expected == len, "MPI Master/Slave received a msg with a size of %d Bytes (expected %d Bytes) !", num_expected, len);
				+        }
				+}
			
 
				+
			
 
				+/* Drain the commands currently pending on node: while
				+ * node->mp_recv_is_ready() reports a message, receive it and hand it to
				+ * _starpu_src_common_store_message(). A message that cannot be stored is
				+ * fatal. Does nothing when node is NULL.
				+ * node->connection_mutex is held for the whole drain.
				+ */
				+static void _starpu_mpi_common_polling_node(struct _starpu_mp_node * node)
				+{
				+        if (node == NULL)
				+                return;
				+
				+        STARPU_PTHREAD_MUTEX_LOCK(&node->connection_mutex);
				+        while(node->mp_recv_is_ready(node))
				+        {
				+                enum _starpu_mp_command answer;
				+                void *arg;
				+                int arg_size;
				+
				+                answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
				+                if(!_starpu_src_common_store_message(node,arg,arg_size,answer))
				+                {
				+                        /* Report through the assertion macro rather than an
				+                         * unterminated printf() right before aborting */
				+                        STARPU_ASSERT_MSG(0, "incorrect command: unknown command or sync command");
				+                }
				+        }
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&node->connection_mutex);
				+}
			
 
				+
			
 
				+/* Non-blocking progress check for an asynchronous channel.
				+ *
				+ * Every MPI request still queued on the channel is tested once; completed
				+ * requests are removed from the list and the sender or receiver pending
				+ * counter (chosen by is_sender) is decremented. The request list is
				+ * destroyed once it becomes empty. Both polling nodes are then polled for
				+ * incoming messages.
				+ *
				+ * Returns non-zero when both pending counters are zero, 0 otherwise.
				+ *
				+ * In device to device communications, the first ack received by the host
				+ * is counted as the sender's (even though it may not actually come from
				+ * the sender).
				+ */
				+int _starpu_mpi_common_test_event(struct _starpu_async_channel * event)
				+{
				+        if (event->event.mpi_ms_event.requests != NULL && !_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests))
				+        {
				+                struct _starpu_mpi_ms_event_request * cur;
				+                struct _starpu_mpi_ms_event_request * following;
				+
				+                for (cur = _starpu_mpi_ms_event_request_list_begin(event->event.mpi_ms_event.requests);
				+                     cur != _starpu_mpi_ms_event_request_list_end(event->event.mpi_ms_event.requests);
				+                     cur = following)
				+                {
				+                        int completed = 0;
				+
				+                        /* Fetch the successor first: cur may be freed below */
				+                        following = _starpu_mpi_ms_event_request_list_next(cur);
				+
				+                        MPI_Test(&cur->request, &completed, MPI_STATUS_IGNORE);
				+                        if (!completed)
				+                                continue;
				+
				+                        _starpu_mpi_ms_event_request_list_erase(event->event.mpi_ms_event.requests, cur);
				+                        _starpu_mpi_ms_event_request_delete(cur);
				+
				+                        if (event->event.mpi_ms_event.is_sender)
				+                                event->starpu_mp_common_finished_sender--;
				+                        else
				+                                event->starpu_mp_common_finished_receiver--;
				+                }
				+
				+                /* An empty list means every request has completed: drop it */
				+                if (_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests))
				+                {
				+                        _starpu_mpi_ms_event_request_list_delete(event->event.mpi_ms_event.requests);
				+                        event->event.mpi_ms_event.requests = NULL;
				+                }
				+        }
				+
				+        _starpu_mpi_common_polling_node(event->polling_node_sender);
				+        _starpu_mpi_common_polling_node(event->polling_node_receiver);
				+
				+        return event->starpu_mp_common_finished_sender == 0 && event->starpu_mp_common_finished_receiver == 0;
				+}
			
 
				+
			
 
				+
			
 
				+/* Blocking completion of an asynchronous channel.
				+ *
				+ * Waits (MPI_Wait) for every MPI request queued on the channel, removing
				+ * each one and decrementing the sender or receiver pending counter (chosen
				+ * by is_sender), then destroys the request list. Afterwards both polling
				+ * nodes are polled in a loop until both pending counters have dropped to
				+ * zero or below.
				+ *
				+ * In device to device communications, the first ack received by the host
				+ * is counted as the sender's (even though it may not actually come from
				+ * the sender).
				+ */
				+void _starpu_mpi_common_wait_event(struct _starpu_async_channel * event)
				+{
				+        if (event->event.mpi_ms_event.requests != NULL && !_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests))
				+        {
				+                struct _starpu_mpi_ms_event_request * cur;
				+                struct _starpu_mpi_ms_event_request * following;
				+
				+                for (cur = _starpu_mpi_ms_event_request_list_begin(event->event.mpi_ms_event.requests);
				+                     cur != _starpu_mpi_ms_event_request_list_end(event->event.mpi_ms_event.requests);
				+                     cur = following)
				+                {
				+                        /* Fetch the successor first: cur is freed below */
				+                        following = _starpu_mpi_ms_event_request_list_next(cur);
				+
				+                        MPI_Wait(&cur->request, MPI_STATUS_IGNORE);
				+                        _starpu_mpi_ms_event_request_list_erase(event->event.mpi_ms_event.requests, cur);
				+                        _starpu_mpi_ms_event_request_delete(cur);
				+
				+                        if (event->event.mpi_ms_event.is_sender)
				+                                event->starpu_mp_common_finished_sender--;
				+                        else
				+                                event->starpu_mp_common_finished_receiver--;
				+                }
				+
				+                STARPU_ASSERT_MSG(_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests), "MPI Request list is not empty after a wait_event !");
				+
				+                /* Every request has completed: drop the list */
				+                _starpu_mpi_ms_event_request_list_delete(event->event.mpi_ms_event.requests);
				+                event->event.mpi_ms_event.requests = NULL;
				+        }
				+
				+        /* Keep polling for incoming acks from the devices */
				+        while(event->starpu_mp_common_finished_sender > 0 || event->starpu_mp_common_finished_receiver > 0)
				+        {
				+                _starpu_mpi_common_polling_node(event->polling_node_sender);
				+                _starpu_mpi_common_polling_node(event->polling_node_receiver);
				+        }
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+/* Synchronize all processes of MPI_COMM_WORLD with an MPI_Barrier. */
				+void _starpu_mpi_common_barrier(void)
				+{
				+        MPI_Barrier(MPI_COMM_WORLD);
				+}
			
 
				+
			
 
				+/* Compute bandwidth and latency between every ordered pair of MPI ranks.
				+ *
				+ * For each (sender, receiver) pair with sender != receiver, the sender
				+ * times NITER sends of SIZE_BANDWIDTH bytes (bandwidth) and NITER sends of
				+ * one byte (latency) and stores the results in bandwidth_dtod[sender][receiver]
				+ * and latency_dtod[sender][receiver]. When a sender has gone through all its
				+ * receivers, its row of both matrices is forwarded to src_node_id, so that
				+ * the source node has the entire set of times at the end.
				+ *
				+ * This function contains MPI_Barrier calls on MPI_COMM_WORLD, so every
				+ * rank has to call it.
				+ *
				+ * NOTE(review): the matrices are indexed directly by MPI rank; this
				+ * presumably requires the number of processes to be at most
				+ * STARPU_MAXMPIDEVS -- confirm against the callers.
				+ */
				+void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS])
				+{
				+        int ret;
				+        unsigned iter;
				+
				+        int nb_proc, id_proc;
				+        MPI_Comm_rank(MPI_COMM_WORLD, &id_proc);
				+        MPI_Comm_size(MPI_COMM_WORLD, &nb_proc);
				+
				+        char * buf;
				+        _STARPU_MALLOC(buf, SIZE_BANDWIDTH);
				+        memset(buf, 0, SIZE_BANDWIDTH);
				+
				+        /* Ranks are ints: keep the loop indices signed to avoid mixed
				+         * signed/unsigned comparisons */
				+        int sender, receiver;
				+        for(sender = 0; sender < nb_proc; sender++)
				+        {
				+                for(receiver = 0; receiver < nb_proc; receiver++)
				+                {
				+                        /* Every rank takes part in the barrier, even for
				+                         * pairs it is not involved in */
				+                        MPI_Barrier(MPI_COMM_WORLD);
				+
				+                        //Node can't be a sender and a receiver
				+                        if(sender == receiver)
				+                                continue;
				+
				+                        if(id_proc == sender)
				+                        {
				+                                double start, end;
				+
				+                                /* measure bandwidth sender to receiver */
				+                                start = starpu_timing_now();
				+                                for (iter = 0; iter < NITER; iter++)
				+                                {
				+                                        ret = MPI_Send(buf, SIZE_BANDWIDTH, MPI_BYTE, receiver, 42, MPI_COMM_WORLD);
				+                                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be measured !");
				+                                }
				+                                end = starpu_timing_now();
				+                                bandwidth_dtod[sender][receiver] = (NITER*1000000)/(end - start);
				+
				+                                /* measure latency sender to receiver */
				+                                start = starpu_timing_now();
				+                                for (iter = 0; iter < NITER; iter++)
				+                                {
				+                                        ret = MPI_Send(buf, 1, MPI_BYTE, receiver, 42, MPI_COMM_WORLD);
				+                                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Latency of MPI Master/Slave cannot be measured !");
				+                                }
				+                                end = starpu_timing_now();
				+                                latency_dtod[sender][receiver] = (end - start)/NITER;
				+                        }
				+
				+                        if (id_proc == receiver)
				+                        {
				+                                /* measure bandwidth sender to receiver*/
				+                                for (iter = 0; iter < NITER; iter++)
				+                                {
				+                                        ret = MPI_Recv(buf, SIZE_BANDWIDTH, MPI_BYTE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
				+                                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be measured !");
				+                                }
				+
				+                                /* measure latency sender to receiver */
				+                                for (iter = 0; iter < NITER; iter++)
				+                                {
				+                                        ret = MPI_Recv(buf, 1, MPI_BYTE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
				+                                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Latency of MPI Master/Slave cannot be measured !");
				+                                }
				+                        }
				+                }
				+
				+                /* When a sender finished its work, it has to send its results to the master */
				+
				+                /* Sender doesn't need to send to itself its data */
				+                if (sender == src_node_id)
				+                        continue;
				+
				+                /* if we are the sender, we send the data */
				+                if (sender == id_proc)
				+                {
				+                        ret = MPI_Send(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
				+                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be sent to the master !");
				+                        ret = MPI_Send(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
				+                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Latency of MPI Master/Slave cannot be sent to the master !");
				+                }
				+
				+                /* the master node receives the data */
				+                if (src_node_id == id_proc)
				+                {
				+                        ret = MPI_Recv(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
				+                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be received by the master !");
				+                        ret = MPI_Recv(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
				+                        STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Latency of MPI Master/Slave cannot be received by the master !");
				+                }
				+
				+        }
				+        free(buf);
				+}
			
--- a/src/drivers/mpi/driver_mpi_common.h
+++ b/src/drivers/mpi/driver_mpi_common.h
@@ -0,0 +1,59 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
			
 
				+ * Copyright (C) 2016  Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __DRIVER_MPI_COMMON_H__
			
 
				+#define __DRIVER_MPI_COMMON_H__
			
 
				+
			
 
				+#include <drivers/mp_common/mp_common.h>
			
 
				+#include <drivers/mpi/driver_mpi_source.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+
			
 
				+/* MPI tags: SYNC_TAG marks blocking MPI_Send/MPI_Recv transfers, ASYNC_TAG
			
 
				+ * marks non-blocking MPI_Isend/MPI_Irecv transfers. */
			
 
				+#define SYNC_TAG 44
			
 
				+#define ASYNC_TAG 45
			
 
				+
			
 
				+int _starpu_mpi_common_mp_init();
			
 
				+void _starpu_mpi_common_mp_deinit();
			
 
				+
			
 
				+int _starpu_mpi_common_is_src_node();
			
 
				+int _starpu_mpi_common_get_src_node();
			
 
				+
			
 
				+int _starpu_mpi_common_is_mp_initialized();
			
 
				+int _starpu_mpi_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
			
 
				+
			
 
				+void _starpu_mpi_common_mp_initialize_src_sink(struct _starpu_mp_node *node);
			
 
				+
			
 
				+/* Transfer len bytes with the remote peer of node. With event == NULL the
			
 
				+ * transfer is blocking; otherwise event is a struct _starpu_async_channel *
			
 
				+ * on which a non-blocking request is queued. */
			
 
				+void _starpu_mpi_common_send(const struct _starpu_mp_node *node, void *msg, int len, void * event);
			
 
				+void _starpu_mpi_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event);
			
 
				+
			
 
				+/* Blocking variants: call the functions above with a NULL event. */
			
 
				+void _starpu_mpi_common_mp_send(const struct _starpu_mp_node *node, void *msg, int len);
			
 
				+void _starpu_mpi_common_mp_recv(const struct _starpu_mp_node *node, void *msg, int len);
			
 
				+
			
 
				+/* Same as send/recv, but the peer MPI rank is given explicitly. */
			
 
				+void _starpu_mpi_common_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len, void * event);
			
 
				+void _starpu_mpi_common_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len, void * event);
			
 
				+
			
 
				+/* test_event: non-blocking, returns non-zero once both pending counters of
			
 
				+ * the channel are zero. wait_event: blocks until they are. */
			
 
				+int _starpu_mpi_common_test_event(struct _starpu_async_channel * event);
			
 
				+void _starpu_mpi_common_wait_event(struct _starpu_async_channel * event);
			
 
				+
			
 
				+/* MPI_Barrier on MPI_COMM_WORLD. */
			
 
				+void _starpu_mpi_common_barrier(void);
			
 
				+
			
 
				+/* Measure bandwidth and latency between every pair of MPI ranks; has to be
			
 
				+ * called by all ranks. */
			
 
				+void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS]);
			
 
				+
			
 
				+
			
 
				+#endif  /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				+
			
 
				+#endif	/* __DRIVER_MPI_COMMON_H__ */
			
--- a/src/drivers/mpi/driver_mpi_sink.c
+++ b/src/drivers/mpi/driver_mpi_sink.c
@@ -0,0 +1,81 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
			
 
				+ * Copyright (C) 2016  Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+#include <mpi.h>
			
 
				+#include <dlfcn.h>
			
 
				+
			
 
				+#include "driver_mpi_sink.h"
			
 
				+#include "driver_mpi_source.h"
			
 
				+#include "driver_mpi_common.h"
			
 
				+
			
 
				+/* Set up a sink node: run the initialization shared by source and sink
				+ * nodes, then allocate one starpu_pthread_t slot per core of the node in
				+ * node->thread_table.
				+ */
				+void _starpu_mpi_sink_init(struct _starpu_mp_node *node)
				+{
				+        _starpu_mpi_common_mp_initialize_src_sink(node);
				+
				+        /* One thread handle per core */
				+        _STARPU_MALLOC(node->thread_table, sizeof(starpu_pthread_t)*node->nb_cores);
				+        //TODO
				+}
			
 
				+
			
 
				+/* Tear down a sink node: release the thread table of node. */
				+void _starpu_mpi_sink_deinit(struct _starpu_mp_node *node)
				+{
				+        free(node->thread_table);
				+        //TODO
				+}
			
 
				+
			
 
				+/* Look func_name up through a dlopen(NULL) handle and return the address
				+ * found by dlsym() (NULL when the symbol is not found). node is unused.
				+ */
				+void (*_starpu_mpi_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void)
				+{
				+        void *dl_handle = dlopen(NULL, RTLD_NOW);
				+        void (*func)(void) = dlsym(dl_handle, func_name);
				+
				+        /* Balance the dlopen() above so that handles do not pile up on
				+         * every lookup. The handle designates the program itself, so
				+         * closing it is not expected to invalidate func. */
				+        if (dl_handle != NULL)
				+                dlclose(dl_handle);
				+
				+        return func;
				+}
			
 
				+
			
 
				+/* Start one worker thread per core of node. Thread i is created with its
				+ * affinity set to CPU i, runs _starpu_sink_thread() and its handle is
				+ * stored in node->thread_table[i].
				+ */
				+void _starpu_mpi_sink_launch_workers(struct _starpu_mp_node *node)
				+{
				+        //TODO
				+        int i, ret;
				+
				+        for(i=0; i < node->nb_cores; i++)
				+        {
				+                struct arg_sink_thread * arg;
				+                cpu_set_t cpuset;
				+                starpu_pthread_attr_t attr;
				+                starpu_pthread_t thread;
				+
				+                //init the set
				+                CPU_ZERO(&cpuset);
				+                CPU_SET(i,&cpuset);
				+
				+                ret = starpu_pthread_attr_init(&attr);
				+                STARPU_ASSERT(ret == 0);
				+                ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
				+                STARPU_ASSERT(ret == 0);
				+
				+                /*prepare the argument for the thread*/
				+                /* NOTE(review): arg is not freed here; presumably
				+                 * _starpu_sink_thread takes ownership -- confirm. */
				+                _STARPU_MALLOC(arg, sizeof(struct arg_sink_thread));
				+                arg->coreid = i;
				+                arg->node = node;
				+
				+                ret = starpu_pthread_create(&thread, &attr, _starpu_sink_thread, arg);
				+                STARPU_ASSERT(ret == 0);
				+                ((starpu_pthread_t *)node->thread_table)[i] = thread;
				+
				+                /* The attribute object is only needed for the creation:
				+                 * destroy it so that it is neither leaked nor
				+                 * re-initialized while still live on the next iteration */
				+                ret = starpu_pthread_attr_destroy(&attr);
				+                STARPU_ASSERT(ret == 0);
				+        }
				+}
			
 
				+
			
 
				+/* Thread binding hook for MPI sinks. Not implemented yet: the body is
				+ * empty and none of the arguments is used.
				+ */
				+void _starpu_mpi_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, int coreid, int * core_table, int nb_core)
				+{
				+        //TODO
				+}
			
--- a/src/drivers/mpi/driver_mpi_sink.h
+++ b/src/drivers/mpi/driver_mpi_sink.h
@@ -0,0 +1,33 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
			
 
				+ * Copyright (C) 2016  Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __DRIVER_MPI_SINK_H__
			
 
				+#define __DRIVER_MPI_SINK_H__
			
 
				+
			
 
				+#include <drivers/mp_common/sink_common.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+
			
 
				+/* Sink-side entry points of the MPI master/slave driver. */
			
 
				+
			
 
				+/* Common initialization, then allocation of node->thread_table. */
			
 
				+void _starpu_mpi_sink_init(struct _starpu_mp_node *node);
			
 
				+/* Frees node->thread_table. */
			
 
				+void _starpu_mpi_sink_deinit(struct _starpu_mp_node *node);
			
 
				+/* Creates one worker thread per core of node, each bound to its core. */
			
 
				+void _starpu_mpi_sink_launch_workers(struct _starpu_mp_node *node);
			
 
				+/* Currently an empty stub. */
			
 
				+void _starpu_mpi_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, int coreid, int * core_table, int nb_core);
			
 
				+/* Resolves func_name with dlsym() on a dlopen(NULL) handle. */
			
 
				+void (*_starpu_mpi_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void);
			
 
				+
			
 
				+#endif  /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				+
			
 
				+#endif	/* __DRIVER_MPI_SINK_H__ */
			
--- a/src/drivers/mpi/driver_mpi_source.c
+++ b/src/drivers/mpi/driver_mpi_source.c
@@ -0,0 +1,343 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
			
 
				+ * Copyright (C) 2016  Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+#include <mpi.h>
			
 
				+#include <errno.h>
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <drivers/mpi/driver_mpi_source.h>
			
 
				+#include <drivers/mpi/driver_mpi_common.h>
			
 
				+
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+
			
 
				+#include <drivers/driver_common/driver_common.h>
			
 
				+#include <drivers/mp_common/source_common.h>
			
 
				+
			
 
				+/* Mutex protecting concurrent accesses to the `kernels' hash table.
			
 
				+ * It is only used in this file and is not declared in driver_mpi_source.h,
			
 
				+ * so it gets internal linkage: such a generic name must not clash at link
			
 
				+ * time with a same-named object defined by another driver.
			
 
				+ */
			
 
				+static starpu_pthread_mutex_t htbl_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
			
 
				+
			
 
				+/* Structure used by the host to store information about a kernel executable
			
 
				+ * on an MPI MS device: its name, and its address on each device.
			
 
				+ * If a kernel has been initialized, then a lookup has already been achieved and the
			
 
				+ * device knows how to call it, else the host still needs to do a lookup.
			
 
				+ *
			
 
				+ * `kernels' is the head of the hash table of registered kernels. It is only
			
 
				+ * used in this file, hence the internal linkage (static storage also makes
			
 
				+ * it start out as NULL, i.e. an empty table).
			
 
				+ */
			
 
				+static struct _starpu_mpi_ms_kernel
			
 
				+{
			
 
				+	UT_hash_handle hh;	/* hash table bookkeeping */
			
 
				+	char *name;		/* heap-allocated copy of the kernel name, used as hash key */
			
 
				+	starpu_mpi_ms_kernel_t func[STARPU_MAXMPIDEVS];	/* per-device address, NULL until looked up */
			
 
				+} *kernels;
			
 
				+
			
 
				+
			
 
				+/* Array of structures containing all the information needed to send
			
 
				+ * data to and receive data from the devices */
			
 
				+struct _starpu_mp_node *mpi_ms_nodes[STARPU_MAXMPIDEVS];
			
 
				+
			
 
				+/* Source-side initialization of NODE. For now this only delegates to
			
 
				+ * _starpu_mpi_common_mp_initialize_src_sink(); the rest is still to be
			
 
				+ * written (see the TODO).
			
 
				+ */
			
 
				+void _starpu_mpi_source_init(struct _starpu_mp_node *node)
			
 
				+{
			
 
				+        _starpu_mpi_common_mp_initialize_src_sink(node);
			
 
				+        //TODO
			
 
				+}
			
 
				+
			
 
				+/* Source-side deinitialization of NODE. Currently a no-op: the body is empty
			
 
				+ * and NODE is not used.
			
 
				+ */
			
 
				+void _starpu_mpi_source_deinit(struct _starpu_mp_node *node)
			
 
				+{
			
 
				+
			
 
				+}
			
 
				+
			
 
				+/* Return the entry of mpi_ms_nodes[] selected by the device id that
			
 
				+ * _starpu_memory_node_get_devid() reports for MEMORY_NODE.
			
 
				+ * The device id is asserted to lie in [0, STARPU_MAXMPIDEVS) before it is
			
 
				+ * used as an index.
			
 
				+ */
			
 
				+struct _starpu_mp_node *_starpu_mpi_src_get_mp_node_from_memory_node(int memory_node)
			
 
				+{
			
 
				+        int devid = _starpu_memory_node_get_devid(memory_node);
			
 
				+        STARPU_ASSERT_MSG(devid >= 0 && devid < STARPU_MAXMPIDEVS, "bogus devid %d for memory node %d\n", devid, memory_node);
			
 
				+
			
 
				+        return mpi_ms_nodes[devid];
			
 
				+}
			
 
				+
			
 
				+/* Allocate SIZE bytes on the device backing MEMORY_NODE: the request is
			
 
				+ * forwarded to _starpu_src_common_allocate(), which receives ADDR to store
			
 
				+ * the result, and whose return value is handed back unchanged.
			
 
				+ */
			
 
				+int _starpu_mpi_src_allocate_memory(void ** addr, size_t size, unsigned memory_node)
			
 
				+{
			
 
				+        return _starpu_src_common_allocate(_starpu_mpi_src_get_mp_node_from_memory_node(memory_node),
			
 
				+                        addr, size);
			
 
				+}
			
 
				+
			
 
				+/* Release ADDR on the device backing MEMORY_NODE by forwarding the request
			
 
				+ * to _starpu_src_common_free().
			
 
				+ */
			
 
				+void _starpu_mpi_source_free_memory(void *addr, unsigned memory_node)
			
 
				+{
			
 
				+        _starpu_src_common_free(_starpu_mpi_src_get_mp_node_from_memory_node(memory_node), addr);
			
 
				+}
			
 
				+
			
 
				+/* Synchronous host-to-device copy: transfer SIZE bytes from the address SRC
			
 
				+ * (SRC_NODE is not used) to the address DST in the DST_NODE memory node.
			
 
				+ * Returns whatever _starpu_src_common_copy_host_to_sink_sync() returns.
			
 
				+ */
			
 
				+int _starpu_mpi_copy_ram_to_mpi_sync(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size)
			
 
				+{
			
 
				+        return _starpu_src_common_copy_host_to_sink_sync(_starpu_mpi_src_get_mp_node_from_memory_node(dst_node),
			
 
				+                        src, dst, size);
			
 
				+}
			
 
				+ 
			
 
				+/* Synchronous device-to-host copy: transfer SIZE bytes from the address SRC
			
 
				+ * in the SRC_NODE memory node to the address DST (DST_NODE is not used).
			
 
				+ * Returns whatever _starpu_src_common_copy_sink_to_host_sync() returns.
			
 
				+ */
			
 
				+int _starpu_mpi_copy_mpi_to_ram_sync(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size)
			
 
				+{
			
 
				+        return _starpu_src_common_copy_sink_to_host_sync(_starpu_mpi_src_get_mp_node_from_memory_node(src_node),
			
 
				+                        src, dst, size);
			
 
				+}
			
 
				+
			
 
				+/* Synchronous device-to-device copy of SIZE bytes from SRC (in SRC_NODE) to
			
 
				+ * DST (in DST_NODE). Both memory nodes are resolved to their mp_node and the
			
 
				+ * work is delegated to _starpu_src_common_copy_sink_to_sink_sync().
			
 
				+ */
			
 
				+int _starpu_mpi_copy_sink_to_sink_sync(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size)
			
 
				+{
			
 
				+        struct _starpu_mp_node *from = _starpu_mpi_src_get_mp_node_from_memory_node(src_node);
			
 
				+        struct _starpu_mp_node *to = _starpu_mpi_src_get_mp_node_from_memory_node(dst_node);
			
 
				+
			
 
				+        return _starpu_src_common_copy_sink_to_sink_sync(from, to, src, dst, size);
			
 
				+}
			
 
				+
			
 
				+/* Asynchronous device-to-host copy of SIZE bytes from SRC (in SRC_NODE) to
			
 
				+ * DST (DST_NODE is not used). EVENT is passed through untouched to
			
 
				+ * _starpu_src_common_copy_sink_to_host_async(), whose result is returned.
			
 
				+ */
			
 
				+int _starpu_mpi_copy_mpi_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, void * event)
			
 
				+{
			
 
				+        return _starpu_src_common_copy_sink_to_host_async(_starpu_mpi_src_get_mp_node_from_memory_node(src_node),
			
 
				+                        src, dst, size, event);
			
 
				+}
			
 
				+
			
 
				+/* Asynchronous host-to-device copy of SIZE bytes from SRC (SRC_NODE is not
			
 
				+ * used) to DST (in DST_NODE). EVENT is passed through untouched to
			
 
				+ * _starpu_src_common_copy_host_to_sink_async(), whose result is returned.
			
 
				+ */
			
 
				+int _starpu_mpi_copy_ram_to_mpi_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size, void * event)
			
 
				+{
			
 
				+        return _starpu_src_common_copy_host_to_sink_async(_starpu_mpi_src_get_mp_node_from_memory_node(dst_node),
			
 
				+                        src, dst, size, event);
			
 
				+}
			
 
				+
			
 
				+/* Asynchronous device-to-device copy of SIZE bytes from SRC (in SRC_NODE) to
			
 
				+ * DST (in DST_NODE). Both memory nodes are resolved to their mp_node and the
			
 
				+ * work, together with EVENT, is delegated to
			
 
				+ * _starpu_src_common_copy_sink_to_sink_async().
			
 
				+ */
			
 
				+int _starpu_mpi_copy_sink_to_sink_async(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size, void * event)
			
 
				+{
			
 
				+        struct _starpu_mp_node *from = _starpu_mpi_src_get_mp_node_from_memory_node(src_node);
			
 
				+        struct _starpu_mp_node *to = _starpu_mpi_src_get_mp_node_from_memory_node(dst_node);
			
 
				+
			
 
				+        return _starpu_src_common_copy_sink_to_sink_async(from, to, src, dst, size, event);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Register the kernel named FUNC_NAME in the host-side kernel table and store
			
 
				+ * its handle in *SYMBOL.
			
 
				+ *
			
 
				+ * If a kernel with this name is already registered, the existing handle is
			
 
				+ * returned. Otherwise a new entry holding a private copy of FUNC_NAME is
			
 
				+ * inserted, with no device address resolved yet.
			
 
				+ *
			
 
				+ * Returns 0 on success, or -ENOMEM when an allocation fails, in which case
			
 
				+ * *SYMBOL is left untouched.
			
 
				+ */
			
 
				+int _starpu_mpi_ms_src_register_kernel(starpu_mpi_ms_func_symbol_t *symbol, const char *func_name)
			
 
				+{
			
 
				+        size_t func_name_size = strlen(func_name) + 1;
			
 
				+        struct _starpu_mpi_ms_kernel *kernel;
			
 
				+        unsigned int i;
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_LOCK(&htbl_mutex);
			
 
				+
			
 
				+        HASH_FIND_STR(kernels, func_name, kernel);
			
 
				+
			
 
				+        if (kernel != NULL)
			
 
				+        {
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
			
 
				+                // Function already in the table.
			
 
				+                *symbol = kernel;
			
 
				+                return 0;
			
 
				+        }
			
 
				+
			
 
				+        kernel = malloc(sizeof(*kernel));
			
 
				+        if (kernel == NULL)
			
 
				+        {
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
			
 
				+                return -ENOMEM;
			
 
				+        }
			
 
				+
			
 
				+        kernel->name = malloc(func_name_size);
			
 
				+        if (kernel->name == NULL)
			
 
				+        {
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
			
 
				+                free(kernel);
			
 
				+                return -ENOMEM;
			
 
				+        }
			
 
				+
			
 
				+        memcpy(kernel->name, func_name, func_name_size);
			
 
				+
			
 
				+        /* Clear every slot, not only those of the devices counted right now:
			
 
				+         * _starpu_mpi_src_get_device_count() reports 0 until the MPI layer is
			
 
				+         * initialized, which would leave the slots uninitialized for a kernel
			
 
				+         * registered that early. This is done before the entry becomes
			
 
				+         * reachable through the table.
			
 
				+         */
			
 
				+        for (i = 0; i < STARPU_MAXMPIDEVS; ++i)
			
 
				+                kernel->func[i] = NULL;
			
 
				+
			
 
				+        HASH_ADD_STR(kernels, name, kernel);
			
 
				+
			
 
				+        STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
			
 
				+
			
 
				+        *symbol = kernel;
			
 
				+
			
 
				+        return 0;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Return the address of the kernel designated by SYMBOL on the device of the
			
 
				+ * calling worker.
			
 
				+ *
			
 
				+ * The address is cached per device in the kernel entry; when it is not known
			
 
				+ * yet, it is obtained with _starpu_src_common_lookup() using the kernel name.
			
 
				+ * Returns NULL when the caller is not a worker or when the lookup fails.
			
 
				+ */
			
 
				+starpu_mpi_ms_kernel_t _starpu_mpi_ms_src_get_kernel(starpu_mpi_ms_func_symbol_t symbol)
			
 
				+{
			
 
				+        struct _starpu_mpi_ms_kernel *entry = symbol;
			
 
				+        int self = starpu_worker_get_id();
			
 
				+
			
 
				+        /* This function has to be called in the codelet only, by the thread
			
 
				+         * which will handle the task */
			
 
				+        if (self < 0)
			
 
				+                return NULL;
			
 
				+
			
 
				+        int dev = starpu_worker_get_devid(self);
			
 
				+
			
 
				+        /* Address already resolved for this device. */
			
 
				+        if (entry->func[dev] != NULL)
			
 
				+                return entry->func[dev];
			
 
				+
			
 
				+        /* First use on this device: look the name up and cache the result. */
			
 
				+        if (_starpu_src_common_lookup(mpi_ms_nodes[dev], (void (**)(void))&entry->func[dev], entry->name))
			
 
				+                return NULL;
			
 
				+
			
 
				+        return entry->func[dev];
			
 
				+}
			
 
				+
			
 
				+/* Return the device function to run for job J (NODE is not used).
			
 
				+ *
			
 
				+ * The MPI MS implementation number J->nimpl of the codelet is tried first;
			
 
				+ * when the codelet has none, the CPU function name of that implementation is
			
 
				+ * registered as a kernel and looked up instead.
			
 
				+ * The result is asserted to be non-NULL, so this function never returns NULL.
			
 
				+ */
			
 
				+void(* _starpu_mpi_ms_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void)
			
 
				+{
			
 
				+        starpu_mpi_ms_kernel_t kernel = NULL;
			
 
				+
			
 
				+        starpu_mpi_ms_func_t func = _starpu_task_get_mpi_ms_nth_implementation(j->task->cl, j->nimpl);
			
 
				+        if (func)
			
 
				+        {
			
 
				+                /* We execute the function contained in the codelet, it must return a
			
 
				+                 * pointer to the function to execute on the device, either specified
			
 
				+                 * directly by the user or by a call to starpu_mpi_ms_get_func().
			
 
				+                 */
			
 
				+                kernel = func();
			
 
				+        }
			
 
				+        else
			
 
				+        {
			
 
				+                /* If the user did not define any starpu_mpi_ms_func_t in
			
 
				+                 * cl->mpi_ms_func, we try to use cpu_func_name.
			
 
				+                 */
			
 
				+                const char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
			
 
				+                if (func_name)
			
 
				+                {
			
 
				+                        starpu_mpi_ms_func_symbol_t symbol;
			
 
				+
			
 
				+                        /* Registration leaves SYMBOL unset when it fails, so only
			
 
				+                         * use it on success; otherwise KERNEL stays NULL and the
			
 
				+                         * assertion below reports the problem.
			
 
				+                         */
			
 
				+                        if (_starpu_mpi_ms_src_register_kernel(&symbol, func_name) == 0)
			
 
				+                                kernel = _starpu_mpi_ms_src_get_kernel(symbol);
			
 
				+                }
			
 
				+        }
			
 
				+        STARPU_ASSERT(kernel);
			
 
				+
			
 
				+        return (void (*)(void))kernel;
			
 
				+}
			
 
				+
			
 
				+/* Number of MPI master-slave devices: the size of MPI_COMM_WORLD minus one,
			
 
				+ * since one rank is the master. Returns 0 as long as
			
 
				+ * _starpu_mpi_common_is_mp_initialized() reports false.
			
 
				+ */
			
 
				+unsigned _starpu_mpi_src_get_device_count()
			
 
				+{
			
 
				+        int world_size;
			
 
				+
			
 
				+        if (!_starpu_mpi_common_is_mp_initialized())
			
 
				+                return 0;
			
 
				+
			
 
				+        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
			
 
				+
			
 
				+        /* Every rank but the master counts as a device. */
			
 
				+        return world_size - 1;
			
 
				+}
			
 
				+
			
 
				+/* Thread entry point driving MPI master-slave worker sets. Always returns NULL.
			
 
				+ *
			
 
				+ * Without STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD, ARG is used as an array of
			
 
				+ * worker sets, one per device reported by _starpu_mpi_src_get_device_count():
			
 
				+ * a first loop initializes every set, a second loop marks every set as
			
 
				+ * initialized, and _starpu_src_common_workers_set() is then called for all
			
 
				+ * of them.
			
 
				+ *
			
 
				+ * With STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD, ARG is a single worker set:
			
 
				+ * the same initialization steps run once and _starpu_src_common_worker() is
			
 
				+ * called for that set.
			
 
				+ *
			
 
				+ * Beware that the braces of the two loops are opened and closed inside
			
 
				+ * preprocessor conditionals, so the nesting shown by the indentation only
			
 
				+ * holds for the single-thread variant.
			
 
				+ */
			
 
				+void *_starpu_mpi_src_worker(void *arg)
			
 
				+{
			
 
				+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				+        struct _starpu_worker_set *worker_set_mpi = (struct _starpu_worker_set *) arg;
			
 
				+        int nbsinknodes = _starpu_mpi_src_get_device_count();
			
 
				+
			
 
				+        int workersetnum;
			
 
				+        /* This loop is closed by the brace marked "for" further down. */
			
 
				+        for (workersetnum = 0; workersetnum < nbsinknodes; workersetnum++)
			
 
				+        {
			
 
				+                struct _starpu_worker_set * worker_set = &worker_set_mpi[workersetnum];
			
 
				+#else
			
 
				+                struct _starpu_worker_set *worker_set = arg;
			
 
				+#endif
			
 
				+
			
 
				+                /* As all workers of a set share common data, we just use the first
			
 
				+                 * one for initializing what follows. */
			
 
				+                struct _starpu_worker *baseworker = &worker_set->workers[0];
			
 
				+                struct _starpu_machine_config *config = baseworker->config;
			
 
				+                /* Index of the base worker within config->workers[]. */
			
 
				+                unsigned baseworkerid = baseworker - config->workers;
			
 
				+                unsigned devid = baseworker->devid;
			
 
				+                unsigned i;
			
 
				+
			
 
				+                /* unsigned memnode = baseworker->memory_node; */
			
 
				+
			
 
				+                _starpu_driver_start(baseworker, _STARPU_FUT_MPI_KEY, 0);
			
 
				+
			
 
				+#ifdef STARPU_USE_FXT             
			
 
				+                /* The base worker was started just above, hence the loop from 1. */
			
 
				+                for (i = 1; i < worker_set->nworkers; i++)
			
 
				+                        _starpu_worker_start(&worker_set->workers[i], _STARPU_FUT_MPI_KEY, 0);
			
 
				+#endif          
			
 
				+
			
 
				+                // A current task makes no sense for a thread managing a worker set.
			
 
				+                _starpu_set_current_task(NULL);
			
 
				+
			
 
				+                /* Name the workers of this device after the device id and core index. */
			
 
				+                for (i = 0; i < config->topology.nmpicores[devid]; i++)
			
 
				+                {
			
 
				+                        struct _starpu_worker *worker = &config->workers[baseworkerid+i];
			
 
				+                        snprintf(worker->name, sizeof(worker->name), "MPI_MS %d core %u", devid, i);
			
 
				+                        snprintf(worker->short_name, sizeof(worker->short_name), "MPI_MS %d.%u", devid, i);
			
 
				+                }
			
 
				+
			
 
				+                /* Name the thread: a single name when one thread handles every set,
			
 
				+                 * a per-device name otherwise. */
			
 
				+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				+                {
			
 
				+                        char thread_name[16];
			
 
				+                        snprintf(thread_name, sizeof(thread_name), "MPI_MS");
			
 
				+                        starpu_pthread_setname(thread_name);
			
 
				+                }
			
 
				+#else
			
 
				+                {
			
 
				+                        char thread_name[16];
			
 
				+                        snprintf(thread_name, sizeof(thread_name), "MPI_MS %d", devid);
			
 
				+                        starpu_pthread_setname(thread_name);
			
 
				+                }
			
 
				+#endif
			
 
				+
			
 
				+                for (i = 0; i < worker_set->nworkers; i++)
			
 
				+                {
			
 
				+                        struct _starpu_worker *worker = &worker_set->workers[i];
			
 
				+                        _STARPU_TRACE_WORKER_INIT_END(worker->workerid);
			
 
				+                }
			
 
				+
			
 
				+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				+                _starpu_src_common_init_switch_env(workersetnum);
			
 
				+        }  /* for */
			
 
				+
			
 
				+        /* set the worker zero for the main thread */
			
 
				+        for (workersetnum = 0; workersetnum < nbsinknodes; workersetnum++)
			
 
				+        {
			
 
				+                struct _starpu_worker_set * worker_set = &worker_set_mpi[workersetnum];
			
 
				+                struct _starpu_worker *baseworker = &worker_set->workers[0];
			
 
				+#endif
			
 
				+
			
 
				+                /* tell the main thread that this one is ready */
			
 
				+                STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
			
 
				+                baseworker->status = STATUS_UNKNOWN;
			
 
				+                worker_set->set_is_initialized = 1;
			
 
				+                STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
			
 
				+                STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
			
 
				+
			
 
				+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				+        }
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+        /* Initialization is over: hand over to the common source-side code. */
			
 
				+#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				+        _starpu_src_common_workers_set(worker_set_mpi, nbsinknodes, mpi_ms_nodes);
			
 
				+#else
			
 
				+        _starpu_src_common_worker(worker_set, baseworkerid, mpi_ms_nodes[devid]);
			
 
				+#endif
			
 
				+
			
 
				+        return NULL;
			
 
				+
			
 
				+
			
 
				+}
			
--- a/src/drivers/mpi/driver_mpi_source.h
+++ b/src/drivers/mpi/driver_mpi_source.h
@@ -0,0 +1,52 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015  Mathieu Lirzin <mthl@openmailbox.org>
			
 
				+ * Copyright (C) 2016  Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __DRIVER_MPI_SOURCE_H__
			
 
				+#define __DRIVER_MPI_SOURCE_H__
			
 
				+
			
 
				+#include <drivers/mp_common/mp_common.h>
			
 
				+#include <starpu_mpi_ms.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+
			
 
				+/* Array of structures containing all the information needed to send
			
 
				+ * data to and receive data from the devices */
			
 
				+extern struct _starpu_mp_node *mpi_ms_nodes[STARPU_MAXMPIDEVS];
			
 
				+struct _starpu_mp_node *_starpu_mpi_src_get_mp_node_from_memory_node(int memory_node);
			
 
				+
			
 
				+unsigned _starpu_mpi_src_get_device_count();
			
 
				+void *_starpu_mpi_src_worker(void *arg);
			
 
				+
			
 
				+void _starpu_mpi_source_init(struct _starpu_mp_node *node);
			
 
				+void _starpu_mpi_source_deinit(struct _starpu_mp_node *node);
			
 
				+
			
 
				+int _starpu_mpi_src_allocate_memory(void ** addr, size_t size, unsigned memory_node);
			
 
				+void _starpu_mpi_source_free_memory(void *addr, unsigned memory_node);
			
 
				+
			
 
				+int _starpu_mpi_copy_mpi_to_ram_sync(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size);
			
 
				+int _starpu_mpi_copy_ram_to_mpi_sync(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size);
			
 
				+int _starpu_mpi_copy_sink_to_sink_sync(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size);
			
 
				+
			
 
				+int _starpu_mpi_copy_mpi_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, void * event);
			
 
				+int _starpu_mpi_copy_ram_to_mpi_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size, void * event);
			
 
				+int _starpu_mpi_copy_sink_to_sink_async(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size, void * event);
			
 
				+
			
 
				+void(* _starpu_mpi_ms_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void);
			
 
				+
			
 
				+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
 
				+
			
 
				+#endif	/* __DRIVER_MPI_SOURCE_H__ */
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -756,15 +756,13 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 
				 	if (!idle)
			
 
				 	{
			
 
				 		/* Not ready yet, no better thing to do than waiting */
			
 
				-		__starpu_datawizard_progress(memnode, 1, 0);
			
 
				-		__starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 0);
			
 
				+		__starpu_datawizard_progress(1, 0);
			
 
				 		return 0;
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				 	res = !idle;
			
 
				-	res |= __starpu_datawizard_progress(memnode, 1, 1);
			
 
				-	res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
			
 
				+	res |= __starpu_datawizard_progress(1, 1);
			
 
				 
			
 
				 	task = _starpu_get_worker_task(worker, workerid, memnode);
			
 
				 
			
--- a/src/drivers/scc/driver_scc_common.c
+++ b/src/drivers/scc/driver_scc_common.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -141,6 +141,8 @@ void _starpu_scc_common_send(const struct _starpu_mp_node *node, void *msg, int
 
				 {
			
 
				 	int ret;
			
 
				 
			
 
				+        STARPU_ASSERT_MSG(!event, "Asynchronous msg is not used here");
			
 
				+
			
 
				 	/* There are potentially 48 threads running on the master core and RCCE_send write
			
 
				 	 * data in the MPB associated to this core. It's not thread safe, so we have to protect it.
			
 
				 	 * RCCE_acquire_lock uses a test&set register on SCC. */
			
@@ -155,8 +157,10 @@ void _starpu_scc_common_send(const struct _starpu_mp_node *node, void *msg, int
 
				 	RCCE_release_lock(RCCE_ue());
			
 
				 }
			
 
				 
			
 
				-void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int len)
			
 
				+void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event)
			
 
				 {
			
 
				+        STARPU_ASSERT_MSG(!event, "Asynchronous msg is not used here");
			
 
				+
			
 
				 	int ret;
			
 
				 	if ((ret = RCCE_recv(msg, len, node->mp_connection.scc_nodeid)) != RCCE_SUCCESS)
			
 
				 		STARPU_MP_COMMON_REPORT_ERROR(node, ret);
			
--- a/src/drivers/scc/driver_scc_common.h
+++ b/src/drivers/scc/driver_scc_common.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -39,8 +39,8 @@ int _starpu_scc_common_is_mp_initialized();
 
				 int _starpu_scc_common_get_src_node_id();
			
 
				 int _starpu_scc_common_is_src_node();
			
 
				 
			
 
				-void _starpu_scc_common_send(const struct _starpu_mp_node *node, void *msg, int len);
			
 
				-void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int len);
			
 
				+void _starpu_scc_common_send(const struct _starpu_mp_node *node, void *msg, int len, void * event);
			
 
				+void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int len, void * event);
			
 
				 
			
 
				 void _starpu_scc_common_report_rcce_error(const char *func, const char *file, const int line, const int err_no);
			
 
				 
			
--- a/src/drivers/scc/driver_scc_sink.c
+++ b/src/drivers/scc/driver_scc_sink.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -58,6 +58,9 @@ void _starpu_scc_sink_deinit(struct _starpu_mp_node *node)
 
				 void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len)
			
 
				 {
			
 
				 	int ret;
			
 
				+
			
 
				+        STARPU_ASSERT_MSG(!event, "Asynchronous msg is not used here");
			
 
				+
			
 
				 	if ((ret = RCCE_send(msg, len, STARPU_TO_SCC_SINK_ID(dst_devid))) != RCCE_SUCCESS)
			
 
				 		STARPU_MP_COMMON_REPORT_ERROR(node, ret);
			
 
				 }
			
@@ -65,6 +68,9 @@ void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst
 
				 void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len)
			
 
				 {
			
 
				 	int ret;
			
 
				+
			
 
				+        STARPU_ASSERT_MSG(!event, "Asynchronous msg is not used here");
			
 
				+
			
 
				 	if ((ret = RCCE_recv(msg, len, STARPU_TO_SCC_SINK_ID(src_devid))) != RCCE_SUCCESS)
			
 
				 		STARPU_MP_COMMON_REPORT_ERROR(node, ret);
			
 
				 }
			
--- a/src/drivers/scc/driver_scc_sink.h
+++ b/src/drivers/scc/driver_scc_sink.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -28,8 +28,8 @@ void _starpu_scc_sink_init(struct _starpu_mp_node *node);
 
				 void _starpu_scc_sink_launch_workers(struct _starpu_mp_node *node);
			
 
				 void _starpu_scc_sink_deinit(struct _starpu_mp_node *node);
			
 
				 
			
 
				-void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len);
			
 
				-void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len);
			
 
				+void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len, void * event);
			
 
				+void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len, void * event);
			
 
				 
			
 
				 void _starpu_scc_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, cpu_set_t * cpuset, int coreid, starpu_pthread_t *thread);
			
 
				 
			
--- a/src/drivers/scc/driver_scc_source.c
+++ b/src/drivers/scc/driver_scc_source.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  INRIA
			
 
				+ * Copyright (C) 2012, 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -259,7 +259,7 @@ void _starpu_scc_set_offset_in_shared_memory(void *ptr, void **dev_handle, size_
 
				  */
			
 
				 int _starpu_scc_copy_src_to_sink(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size)
			
 
				 {
			
 
				-	return _starpu_src_common_copy_host_to_sink(_starpu_scc_src_memory_node_to_mp_node(dst_node),
			
 
				+	return _starpu_src_common_copy_host_to_sink_sync(_starpu_scc_src_memory_node_to_mp_node(dst_node),
			
 
				 			src, dst, size);
			
 
				 }
			
 
				 
			
@@ -268,13 +268,13 @@ int _starpu_scc_copy_src_to_sink(void *src, unsigned src_node STARPU_ATTRIBUTE_U
 
				  */
			
 
				 int _starpu_scc_copy_sink_to_src(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size)
			
 
				 {
			
 
				-	return _starpu_src_common_copy_sink_to_host(_starpu_scc_src_memory_node_to_mp_node(src_node),
			
 
				+	return _starpu_src_common_copy_sink_to_host_sync(_starpu_scc_src_memory_node_to_mp_node(src_node),
			
 
				 			src, dst, size);
			
 
				 }
			
 
				 
			
 
				 int _starpu_scc_copy_sink_to_sink(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size)
			
 
				 {
			
 
				-	return _starpu_src_common_copy_sink_to_sink(_starpu_scc_src_memory_node_to_mp_node(src_node),
			
 
				+	return _starpu_src_common_copy_sink_to_sink_sync(_starpu_scc_src_memory_node_to_mp_node(src_node),
			
 
				 			_starpu_scc_src_memory_node_to_mp_node(dst_node),
			
 
				 			src, dst, size);
			
 
				 }
			
--- a/src/starpu_parameters.h
+++ b/src/starpu_parameters.h
@@ -27,4 +27,5 @@
 
				 #define _STARPU_OPENCL_ALPHA	12.22f
			
 
				 #define _STARPU_MIC_ALPHA	0.5f
			
 
				 #define _STARPU_SCC_ALPHA	1.0f
			
 
				+#define _STARPU_MPI_MS_ALPHA	1.0f
			
 
				 #endif /* _STARPU_PARAMETERS_H */
			
--- a/src/top/starpu_top.c
+++ b/src/top/starpu_top.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony Roy
			
 
				  * Copyright (C) 2011, 2012, 2013, 2016 CNRS
			
 
				+ * Copyright (C) 2016  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -111,6 +112,9 @@ static void starpu_top_get_device_type(int id, char* type)
 
				 	case STARPU_SCC_WORKER:
			
 
				 		strncpy(type, "SCC", 9);
			
 
				 		break;
			
 
				+	case STARPU_MPI_WORKER:
			
 
				+		strncpy(type, "MPI", 9);
			
 
				+		break;
			
 
				 	}
			
 
				 	type[9] = 0;
			
 
				 }
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -110,11 +110,15 @@ LOADER			=
 
				 LOADER_BIN		=	$(top_builddir)/tests/loader-cross.sh
			
 
				 endif
			
 
				 
			
 
				+if STARPU_USE_MPI_MASTER_SLAVE
			
 
				+MPI = $(MPIEXEC) $(MPIEXEC_ARGS) -np 4
			
 
				+endif
			
 
				+
			
 
				 if STARPU_HAVE_AM111
			
 
				-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
 
				-LOG_COMPILER		=	$(LOADER_BIN)
			
 
				+TESTS_ENVIRONMENT   =   top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
 
				+LOG_COMPILER        =   $(MPI) $(LOADER_BIN)
			
 
				 else
			
 
				-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
			
 
				+TESTS_ENVIRONMENT   =   top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
			
 
				 endif
			
 
				 endif
			
 
				 
			
--- a/tests/datawizard/copy.c
+++ b/tests/datawizard/copy.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010-2011, 2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				+ * Copyright (C) 2016  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -64,7 +65,7 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER) == 0 &&
			
 
				-		starpu_worker_get_count_by_type(STARPU_MIC_WORKER) == 0)
			
 
				+		starpu_worker_get_count_by_type(STARPU_MIC_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_MPI_WORKER) == 0)
			
 
				 	{
			
 
				 		FPRINTF(stderr, "This application requires a CUDA , OpenCL or MIC Worker\n");
			
 
				 		starpu_shutdown();
			
--- a/tests/datawizard/manual_reduction.c
+++ b/tests/datawizard/manual_reduction.c
@@ -175,6 +175,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_conf_init(&conf);
			
 
				 	conf.nmic = 0;
			
 
				+	conf.nmpi_ms = 0;
			
 
				 	conf.nscc = 0;
			
 
				 
			
 
				 	variable = INIT_VALUE;
			
--- a/tests/errorcheck/starpu_init_noworker.c
+++ b/tests/errorcheck/starpu_init_noworker.c
@@ -57,22 +57,27 @@ int main(int argc, char **argv)
 
				 	conf.nopencl = 0;
			
 
				 	conf.nmic = 0;
			
 
				 	conf.nscc = 0;
			
 
				+        conf.nmpi_ms = 0;
			
 
				 
			
 
				 	/* starpu_init should return -ENODEV */
			
 
				-	ret = starpu_initialize(&conf, &argc, &argv);
			
 
				-	if (ret == -ENODEV)
			
 
				-	     return EXIT_SUCCESS;
			
 
				-	else
			
 
				-	{
			
 
				-	     	unsigned ncpu = starpu_cpu_worker_get_count();
			
 
				-		unsigned ncuda = starpu_cuda_worker_get_count();
			
 
				-		unsigned nopencl = starpu_opencl_worker_get_count();
			
 
				-		FPRINTF(stderr, "StarPU has found :\n");
			
 
				-		FPRINTF(stderr, "\t%u CPU cores\n", ncpu);
			
 
				-		FPRINTF(stderr, "\t%u CUDA devices\n", ncuda);
			
 
				-		FPRINTF(stderr, "\t%u OpenCL devices\n", nopencl);
			
 
				-		return EXIT_FAILURE;
			
 
				-	}
			
 
				+        ret = starpu_initialize(&conf, &argc, &argv);
			
 
				+        if (ret == -ENODEV)
			
 
				+                return EXIT_SUCCESS;
			
 
				+        else
			
 
				+        {
			
 
				+                unsigned ncpu = starpu_cpu_worker_get_count();
			
 
				+                unsigned ncuda = starpu_cuda_worker_get_count();
			
 
				+                unsigned nopencl = starpu_opencl_worker_get_count();
			
 
				+                unsigned nmic = starpu_mic_worker_get_count();
			
 
				+                unsigned nmpi_ms = starpu_mpi_ms_worker_get_count();
			
 
				+                FPRINTF(stderr, "StarPU has found :\n");
			
 
				+                FPRINTF(stderr, "\t%u CPU cores\n", ncpu);
			
 
				+                FPRINTF(stderr, "\t%u CUDA devices\n", ncuda);
			
 
				+                FPRINTF(stderr, "\t%u OpenCL devices\n", nopencl);
			
 
				+                FPRINTF(stderr, "\t%u MIC devices\n", nmic);
			
 
				+                FPRINTF(stderr, "\t%u MPI Master-Slaves devices\n", nmpi_ms);
			
 
				+                return EXIT_FAILURE;
			
 
				+        }
			
 
				 
			
 
				 
			
 
				 }
			
--- a/tests/perfmodels/valid_model.c
+++ b/tests/perfmodels/valid_model.c
@@ -105,6 +105,7 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 
				 
			
 
				 	// We need to call starpu_init again to initialise values used by perfmodels
			
 
				 	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	char path[256];